Skip to content

Commit 63a5fd5

Browse files
author
Rahul Sharma
committed
change int96 internal format
1 parent 325d335 commit 63a5fd5

File tree

1 file changed

+46
-63
lines changed

1 file changed

+46
-63
lines changed

parquet/src/data_type.rs

Lines changed: 46 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@ use crate::util::bit_util::FromBytes;
3535
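/// The value holds 12 bytes of data. NOTE(review): with an `i64` field and no visible `#[repr(...)]`, the struct itself may be padded to 16 bytes on targets where `i64` is 8-byte aligned - confirm.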
3636
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
3737
pub struct Int96 {
38-
value: [u32; 3],
38+
/// First 8 bytes store nanoseconds since midnight
39+
pub nanos: i64,
40+
/// Last 4 bytes store Julian days
41+
pub days: i32,
3942
}
4043

4144
const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588;
@@ -59,19 +62,25 @@ const NANOSECONDS_IN_DAY: i64 = SECONDS_IN_DAY * NANOSECONDS;
5962
impl Int96 {
6063
/// Creates new INT96 type struct with no data set.
6164
pub fn new() -> Self {
62-
Self { value: [0; 3] }
65+
Self { nanos: 0, days: 0 }
6366
}
6467

65-
/// Returns underlying data as slice of [`u32`].
68+
/// Returns underlying data as slice of [`u32`] for compatibility with Parquet format
6669
#[inline]
6770
pub fn data(&self) -> &[u32] {
68-
&self.value
71+
// SAFETY: We're reinterpreting the bytes of our struct as [u32; 3]
72+
// This is safe because:
73+
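// 1. The memory layout is assumed compatible (`nanos` in bytes 0..8, `days` in bytes 8..12). NOTE(review): no `#[repr(C)]` is visible on `Int96`, so Rust does not guarantee this field order or a 12-byte size - confirm.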
74+
// 2. The alignment requirements are met (u32 requires 4-byte alignment)
75+
// 3. We maintain the invariant that the bytes are always valid u32s
76+
unsafe { std::slice::from_raw_parts(self as *const Int96 as *const u32, 3) }
6977
}
7078

71-
/// Sets data for this INT96 type.
79+
/// Sets data for this INT96 type from raw Parquet format.
7280
#[inline]
7381
pub fn set_data(&mut self, elem0: u32, elem1: u32, elem2: u32) {
74-
self.value = [elem0, elem1, elem2];
82+
self.nanos = ((elem1 as i64) << 32) | (elem0 as i64);
83+
self.days = elem2 as i32;
7584
}
7685

7786
/// Converts this INT96 into an i64 representing the number of MILLISECONDS since Epoch
@@ -81,8 +90,6 @@ impl Int96 {
8190
}
8291

8392
/// Converts this INT96 into an i64 representing the number of SECONDS since EPOCH
84-
///
85-
/// Will wrap around on overflow
8693
#[inline]
8794
pub fn to_seconds(&self) -> i64 {
8895
let (day, nanos) = self.data_as_days_and_nanos();
@@ -92,8 +99,6 @@ impl Int96 {
9299
}
93100

94101
/// Converts this INT96 into an i64 representing the number of MILLISECONDS since EPOCH
95-
///
96-
/// Will wrap around on overflow
97102
#[inline]
98103
pub fn to_millis(&self) -> i64 {
99104
let (day, nanos) = self.data_as_days_and_nanos();
@@ -103,8 +108,6 @@ impl Int96 {
103108
}
104109

105110
/// Converts this INT96 into an i64 representing the number of MICROSECONDS since EPOCH
106-
///
107-
/// Will wrap around on overflow
108111
#[inline]
109112
pub fn to_micros(&self) -> i64 {
110113
let (day, nanos) = self.data_as_days_and_nanos();
@@ -114,8 +117,6 @@ impl Int96 {
114117
}
115118

116119
/// Converts this INT96 into an i64 representing the number of NANOSECONDS since EPOCH
117-
///
118-
/// Will wrap around on overflow
119120
#[inline]
120121
pub fn to_nanos(&self) -> i64 {
121122
let (day, nanos) = self.data_as_days_and_nanos();
@@ -124,33 +125,19 @@ impl Int96 {
124125
.wrapping_add(nanos)
125126
}
126127

127-
/// Sets the INT96 data from seconds since epoch
128-
///
129-
/// Will wrap around on overflow
130-
#[inline]
131-
pub fn set_data_from_seconds(&mut self, seconds: i64) {
132-
self.set_data_from_nanos(seconds.wrapping_mul(NANOSECONDS));
133-
}
134-
135-
/// Sets the INT96 data from milliseconds since epoch
136-
///
137-
/// Will wrap around on overflow
128+
/// Sets the INT96 data directly from days and nanoseconds
138129
#[inline]
139-
pub fn set_data_from_millis(&mut self, millis: i64) {
140-
self.set_data_from_nanos(millis.wrapping_mul(MICROSECONDS));
130+
pub fn set_data_from_days_and_nanos(&mut self, days: i32, nanos: i64) {
131+
self.days = days;
132+
self.nanos = nanos;
141133
}
142134

143-
/// Sets the INT96 data from microseconds since epoch
144-
///
145-
/// Will wrap around on overflow
146135
#[inline]
147-
pub fn set_data_from_micros(&mut self, micros: i64) {
148-
self.set_data_from_nanos(micros.wrapping_mul(MILLISECONDS));
136+
fn data_as_days_and_nanos(&self) -> (i32, i64) {
137+
(self.days, self.nanos)
149138
}
150139

151140
/// Sets the INT96 data from nanoseconds since epoch
152-
///
153-
/// Will wrap around on overflow
154141
#[inline]
155142
pub fn set_data_from_nanos(&mut self, nanos: i64) {
156143
let days = nanos / NANOSECONDS_IN_DAY;
@@ -159,24 +146,32 @@ impl Int96 {
159146
self.set_data_from_days_and_nanos(julian_day, remaining_nanos);
160147
}
161148

162-
/// Sets the INT96 data directly from days and nanoseconds
163-
///
164-
/// This is the most direct way to set the Int96 data structure which internally
165-
/// stores days and nanoseconds. The days should be Julian days since epoch.
149+
/// Sets the INT96 data from seconds since epoch
150+
#[inline]
151+
pub fn set_data_from_seconds(&mut self, seconds: i64) {
152+
self.set_data_from_nanos(seconds.wrapping_mul(NANOSECONDS))
153+
}
166154

155+
/// Sets the INT96 data from milliseconds since epoch
167156
#[inline]
168-
pub fn set_data_from_days_and_nanos(&mut self, days: i32, nanos: i64) {
169-
let julian_day = (days as i32) as u32;
170-
let nanos_low = (nanos & 0xFFFFFFFF) as u32;
171-
let nanos_high = ((nanos >> 32) & 0xFFFFFFFF) as u32;
172-
self.set_data(nanos_low, nanos_high, julian_day);
157+
pub fn set_data_from_millis(&mut self, millis: i64) {
158+
self.set_data_from_nanos(millis.wrapping_mul(MICROSECONDS))
173159
}
174160

161+
/// Sets the INT96 data from microseconds since epoch
175162
#[inline]
176-
fn data_as_days_and_nanos(&self) -> (i32, i64) {
177-
let day = self.data()[2] as i32;
178-
let nanos = ((self.data()[1] as i64) << 32) + self.data()[0] as i64;
179-
(day, nanos)
163+
pub fn set_data_from_micros(&mut self, micros: i64) {
164+
self.set_data_from_nanos(micros.wrapping_mul(MILLISECONDS))
165+
}
166+
}
167+
168+
169+
impl From<Vec<u32>> for Int96 {
170+
fn from(buf: Vec<u32>) -> Self {
171+
assert_eq!(buf.len(), 3);
172+
let mut result = Self::new();
173+
result.set_data(buf[0], buf[1], buf[2]);
174+
result
180175
}
181176
}
182177

@@ -188,25 +183,13 @@ impl PartialOrd for Int96 {
188183

189184
impl Ord for Int96 {
190185
fn cmp(&self, other: &Self) -> Ordering {
191-
let (self_days, self_nanos) = self.data_as_days_and_nanos();
192-
let (other_days, other_nanos) = other.data_as_days_and_nanos();
193-
194-
match self_days.cmp(&other_days) {
195-
Ordering::Equal => self_nanos.cmp(&other_nanos),
186+
match self.days.cmp(&other.days) {
187+
Ordering::Equal => self.nanos.cmp(&other.nanos),
196188
ord => ord,
197189
}
198190
}
199191
}
200192

201-
impl From<Vec<u32>> for Int96 {
202-
fn from(buf: Vec<u32>) -> Self {
203-
assert_eq!(buf.len(), 3);
204-
let mut result = Self::new();
205-
result.set_data(buf[0], buf[1], buf[2]);
206-
result
207-
}
208-
}
209-
210193
impl fmt::Display for Int96 {
211194
#[cold]
212195
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
@@ -675,8 +658,8 @@ impl AsBytes for bool {
675658

676659
impl AsBytes for Int96 {
677660
fn as_bytes(&self) -> &[u8] {
678-
// SAFETY: Int96::data is a &[u32; 3].
679-
unsafe { std::slice::from_raw_parts(self.data() as *const [u32] as *const u8, 12) }
661+
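// SAFETY: Assumes the layout of Int96 is i64 followed by i32, so the first 12 bytes are the value. NOTE(review): without a visible `#[repr(C)]` on `Int96` this field order is not guaranteed - confirm.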
662+
unsafe { std::slice::from_raw_parts(self as *const Int96 as *const u8, 12) }
680663
}
681664
}
682665

0 commit comments

Comments
 (0)