Data Module¶
Utilities for loading, transforming, validating, and exporting tabular data.
Loading Data¶
From Parquet¶
From CSV¶
From S3¶
From PostgreSQL¶
Rust
let df = DataFrame::from_sql(
"postgres://user:pass@host/db",
"SELECT * FROM prices WHERE date > '2020-01-01'"
)?;
Note: avoid hard-coding credentials in the connection URL as shown above; in real code, read them from an environment variable or a secrets store and build the URL at runtime.
DataFrame Operations¶
Column Access¶
Rust
// Get column
let prices = df.column("close")?;
// Get multiple columns
let subset = df.select(&["date", "ticker", "close"])?;
Filtering¶
Rust
// Filter rows
let filtered = df.filter(|row| row["market_cap"] > 1_000_000_000.0)?;
// Filter by date
let recent = df.filter_date_range("2020-01-01", "2024-12-31")?;
Grouping¶
Rust
// Group by column
let grouped = df.group_by("sector")?;
// Aggregate
let sector_avg = grouped.mean("close")?;
Joining¶
Rust
let prices = DataFrame::from_parquet("prices.parquet")?;
let fundamentals = DataFrame::from_parquet("fundamentals.parquet")?;
let joined = prices.join(&fundamentals, &["date", "ticker"])?;
Time Series Operations¶
Lag¶
Diff¶
Rolling¶
Data Validation¶
Check Missing¶
Rust
let missing = df.missing_values()?;
for (col, count) in missing {
println!("{}: {} missing", col, count);
}
Fill Missing¶
Rust
// Forward fill
let filled = df.forward_fill("close")?;
// Fill with value
let filled = df.fill_value("close", 0.0)?;
Remove Duplicates¶
Data Types¶
Column Types¶
Rust
use sigc::data::ColumnType;
let schema = df.schema();
for (name, dtype) in schema.iter() {
match dtype {
ColumnType::Numeric => println!("{}: Numeric", name),
ColumnType::Date => println!("{}: Date", name),
ColumnType::Symbol => println!("{}: Symbol", name),
ColumnType::Category => println!("{}: Category", name),
}
}
Type Conversion¶
Memory Mapping¶
Enable Memory Mapping¶
Configure¶
Rust
use sigc::data::MmapOptions;
let options = MmapOptions::default()
.max_size_gb(50)
.read_only(true);
let df = DataFrame::from_parquet_with_options("data.parquet", options)?;
Export¶
Rust
// To Parquet
df.to_parquet("output.parquet")?;
// To CSV
df.to_csv("output.csv")?;
// To Arrow
df.to_arrow("output.arrow")?;