polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
50#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
51#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
52#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
53#[strum(serialize_all = "snake_case")]
54pub enum UniqueKeepStrategy {
55 /// Keep the first unique row.
56 First,
57 /// Keep the last unique row.
58 Last,
59 /// Keep None of the unique rows.
60 None,
61 /// Keep any of the unique rows.
62 /// This allows more optimizations.
63 #[default]
64 Any,
65}
66
67fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
68where
69 F: for<'a> FnMut(&'a T) -> &'a str,
70{
71 // Always unique.
72 if items.len() <= 1 {
73 return Ok(());
74 }
75
76 if items.len() <= 4 {
77 // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
78 for i in 0..items.len() - 1 {
79 let name = get_name(&items[i]);
80 for other in items.iter().skip(i + 1) {
81 if name == get_name(other) {
82 polars_bail!(duplicate = name);
83 }
84 }
85 }
86 } else {
87 let mut names = PlHashSet::with_capacity(items.len());
88 for item in items {
89 let name = get_name(item);
90 if !names.insert(name) {
91 polars_bail!(duplicate = name);
92 }
93 }
94 }
95 Ok(())
96}
97
98/// A contiguous growable collection of `Series` that have the same length.
99///
100/// ## Use declarations
101///
102/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
103///
104/// ```rust
105/// use polars_core::prelude::*; // if the crate polars-core is used directly
106/// // use polars::prelude::*; if the crate polars is used
107/// ```
108///
109/// # Initialization
110/// ## Default
111///
112/// A `DataFrame` can be initialized empty:
113///
114/// ```rust
115/// # use polars_core::prelude::*;
116/// let df = DataFrame::default();
117/// assert!(df.is_empty());
118/// ```
119///
120/// ## Wrapping a `Vec<Series>`
121///
122 /// A `DataFrame` is built upon a `Vec<Column>` where the columns have the same length.
123///
124/// ```rust
125/// # use polars_core::prelude::*;
126/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
127/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
128///
129/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
130/// ```
131///
132/// ## Using a macro
133///
134/// The [`df!`] macro is a convenient method:
135///
136/// ```rust
137/// # use polars_core::prelude::*;
138/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
139/// "Color" => ["Red", "Yellow", "Green"]);
140/// ```
141///
142/// ## Using a CSV file
143///
144/// See the `polars_io::csv::CsvReader`.
145///
146/// # Indexing
147/// ## By a number
148///
149 /// `Index<usize>` is implemented for `DataFrame`.
150///
151/// ```rust
152/// # use polars_core::prelude::*;
153/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
154/// "Color" => ["Red", "Yellow", "Green"])?;
155///
156/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
157/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
158/// # Ok::<(), PolarsError>(())
159/// ```
160///
161/// ## By a `Series` name
162///
163/// ```rust
164/// # use polars_core::prelude::*;
165/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
166/// "Color" => ["Red", "Yellow", "Green"])?;
167///
168/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
169/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
170/// # Ok::<(), PolarsError>(())
171/// ```
172#[derive(Clone)]
173pub struct DataFrame {
174 height: usize,
175 // invariant: columns[i].len() == height for each 0 <= i < columns.len()
176 pub(crate) columns: Vec<Column>,
177
178 /// A cached schema. This might not give correct results if the DataFrame was modified in place
179 /// between schema and reading.
180 cached_schema: OnceLock<SchemaRef>,
181}
182
183impl DataFrame {
184 pub fn clear_schema(&mut self) {
185 self.cached_schema = OnceLock::new();
186 }
187
188 #[inline]
189 pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
190 self.columns.iter()
191 }
192
193 #[inline]
194 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
195 self.columns.iter().map(Column::as_materialized_series)
196 }
197
198 #[inline]
199 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
200 self.columns.par_iter().map(Column::as_materialized_series)
201 }
202
203 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
204 ///
205 /// # Implementation
206 /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
207 /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
208 /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
209 ///
210 /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
211 /// However, this function will yield a smaller number. This is because this function returns
212 /// the visible size of the buffer, not its total capacity.
213 ///
214 /// FFI buffers are included in this estimation.
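///
/// # Example
///
/// A minimal usage sketch; the reported number is only an estimate and depends on the
/// buffer layout (the column name is illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3])?;
/// // A non-empty numeric column always has some allocated buffer space.
/// assert!(df.estimated_size() > 0);
/// # Ok::<(), PolarsError>(())
/// ```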
215 pub fn estimated_size(&self) -> usize {
216 self.columns.iter().map(Column::estimated_size).sum()
217 }
218
219 // Reduce monomorphization.
220 fn try_apply_columns(
221 &self,
222 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
223 ) -> PolarsResult<Vec<Column>> {
224 self.columns.iter().map(func).collect()
225 }
226 // Reduce monomorphization.
227 pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
228 self.columns.iter().map(func).collect()
229 }
230 // Reduce monomorphization.
231 fn try_apply_columns_par(
232 &self,
233 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
234 ) -> PolarsResult<Vec<Column>> {
235 POOL.install(|| self.columns.par_iter().map(func).collect())
236 }
237 // Reduce monomorphization.
238 pub fn _apply_columns_par(
239 &self,
240 func: &(dyn Fn(&Column) -> Column + Send + Sync),
241 ) -> Vec<Column> {
242 POOL.install(|| self.columns.par_iter().map(func).collect())
243 }
244
245 /// Get the index of the column.
246 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
247 self.get_column_index(name)
248 .ok_or_else(|| polars_err!(col_not_found = name))
249 }
250
251 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
252 polars_ensure!(
253 self.columns.iter().all(|s| s.name().as_str() != name),
254 Duplicate: "column with name {:?} is already present in the DataFrame", name
255 );
256 Ok(())
257 }
258
259 /// Reserve additional slots in the chunks of each series.
260 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
261 for s in &mut self.columns {
262 if let Column::Series(s) = s {
263 // SAFETY:
264 // do not modify the data, simply resize.
265 unsafe { s.chunks_mut().reserve(additional) }
266 }
267 }
268 }
269
270 /// Create a DataFrame from a Vector of Series.
271 ///
272 /// Errors if the column names are not unique or if the heights are not all equal.
273 ///
274 /// # Example
275 ///
276 /// ```
277 /// # use polars_core::prelude::*;
278 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
279 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
280 ///
281 /// let df = DataFrame::new(vec![s0, s1])?;
282 /// # Ok::<(), PolarsError>(())
283 /// ```
284 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
285 DataFrame::validate_columns_slice(&columns)
286 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
287 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
288 }
289
290 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
291 for col in &columns {
292 polars_ensure!(
293 col.len() == height,
294 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
295 columns[0].name(), height, col.name(), col.len()
296 );
297 }
298
299 Ok(DataFrame {
300 height,
301 columns,
302 cached_schema: OnceLock::new(),
303 })
304 }
305
306 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
307 /// columns to match the other columns.
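///
/// # Example
///
/// A small sketch of the broadcasting rule described above (column names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let a = Column::new("a".into(), [1, 2, 3]);
/// let b = Column::new("b".into(), [10]); // length 1, broadcast to length 3
/// let df = DataFrame::new_with_broadcast(vec![a, b])?;
/// assert_eq!(df.shape(), (3, 2));
/// # Ok::<(), PolarsError>(())
/// ```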
308 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
309 // The length of the longest non-unit length column determines the
310 // broadcast length. If all columns are unit-length the broadcast length
311 // is one.
312 let broadcast_len = columns
313 .iter()
314 .map(|s| s.len())
315 .filter(|l| *l != 1)
316 .max()
317 .unwrap_or(1);
318 Self::new_with_broadcast_len(columns, broadcast_len)
319 }
320
321 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
322 /// columns to broadcast_len.
323 pub fn new_with_broadcast_len(
324 columns: Vec<Column>,
325 broadcast_len: usize,
326 ) -> PolarsResult<Self> {
327 ensure_names_unique(&columns, |s| s.name().as_str())?;
328 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
329 }
330
331 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
332 /// columns to match the other columns.
333 ///
334 /// # Safety
335 /// Does not check that the column names are unique (which they must be).
336 pub unsafe fn new_with_broadcast_no_namecheck(
337 mut columns: Vec<Column>,
338 broadcast_len: usize,
339 ) -> PolarsResult<Self> {
340 for col in &mut columns {
341 // Length not equal to the broadcast len, needs broadcast or is an error.
342 let len = col.len();
343 if len != broadcast_len {
344 if len != 1 {
345 let name = col.name().to_owned();
346 let extra_info =
347 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
348 format!(" (matching column '{}')", c.name())
349 } else {
350 String::new()
351 };
352 polars_bail!(
353 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
354 );
355 }
356 *col = col.new_from_index(0, broadcast_len);
357 }
358 }
359
360 let length = if columns.is_empty() { 0 } else { broadcast_len };
361
362 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
363 }
364
365 pub fn new_from_index(&self, index: usize, height: usize) -> Self {
366 let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
367 unsafe { Self::new_no_checks(height, cols.collect()) }
368 }
369
370 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
371 ///
372 /// # Example
373 ///
374 /// ```rust
375 /// use polars_core::prelude::DataFrame;
376 /// static EMPTY: DataFrame = DataFrame::empty();
377 /// ```
378 pub const fn empty() -> Self {
379 Self::empty_with_height(0)
380 }
381
382 /// Creates an empty `DataFrame` with a specific `height`.
383 pub const fn empty_with_height(height: usize) -> Self {
384 DataFrame {
385 height,
386 columns: vec![],
387 cached_schema: OnceLock::new(),
388 }
389 }
390
391 /// Create an empty `DataFrame` with empty columns as per the `schema`.
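///
/// # Example
///
/// A minimal sketch (field name and dtype are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let schema = Schema::from_iter(vec![Field::new("a".into(), DataType::Int32)]);
/// let df = DataFrame::empty_with_schema(&schema);
/// assert_eq!(df.shape(), (0, 1));
/// ```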
392 pub fn empty_with_schema(schema: &Schema) -> Self {
393 let cols = schema
394 .iter()
395 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
396 .collect();
397 unsafe { DataFrame::new_no_checks(0, cols) }
398 }
399
400 /// Create an empty `DataFrame` with empty columns as per the `schema`.
401 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
402 let cols = schema
403 .iter_values()
404 .map(|fld| {
405 Column::from(Series::new_empty(
406 fld.name.clone(),
407 &(DataType::from_arrow_field(fld)),
408 ))
409 })
410 .collect();
411 unsafe { DataFrame::new_no_checks(0, cols) }
412 }
413
414 /// Create a new `DataFrame` with the given schema, only containing nulls.
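///
/// # Example
///
/// A minimal sketch (field name and dtype are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let schema = Schema::from_iter(vec![Field::new("a".into(), DataType::Int32)]);
/// let df = DataFrame::full_null(&schema, 3);
/// assert_eq!(df.shape(), (3, 1));
/// ```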
415 pub fn full_null(schema: &Schema, height: usize) -> Self {
416 let columns = schema
417 .iter_fields()
418 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
419 .collect();
420 unsafe { DataFrame::new_no_checks(height, columns) }
421 }
422
423 /// Removes the last column from the `DataFrame` and returns it, or [`None`] if the `DataFrame` is empty.
424 ///
425 /// # Example
426 ///
427 /// ```rust
428 /// # use polars_core::prelude::*;
429 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
430 /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
431 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
432 ///
433 /// assert_eq!(df.pop(), Some(s2));
434 /// assert_eq!(df.pop(), Some(s1));
435 /// assert_eq!(df.pop(), None);
436 /// assert!(df.is_empty());
437 /// # Ok::<(), PolarsError>(())
438 /// ```
439 pub fn pop(&mut self) -> Option<Column> {
440 self.clear_schema();
441
442 self.columns.pop()
443 }
444
445 /// Add a new column at index 0 that counts the rows.
446 ///
447 /// # Example
448 ///
449 /// ```
450 /// # use polars_core::prelude::*;
451 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
452 /// assert_eq!(df1.shape(), (4, 1));
453 ///
454 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
455 /// assert_eq!(df2.shape(), (4, 2));
456 /// println!("{}", df2);
457 ///
458 /// # Ok::<(), PolarsError>(())
459 /// ```
460 ///
461 /// Output:
462 ///
463 /// ```text
464 /// shape: (4, 2)
465 /// +-----+----------+
466 /// | Id | Name |
467 /// | --- | --- |
468 /// | u32 | str |
469 /// +=====+==========+
470 /// | 0 | James |
471 /// +-----+----------+
472 /// | 1 | Mary |
473 /// +-----+----------+
474 /// | 2 | John |
475 /// +-----+----------+
476 /// | 3 | Patricia |
477 /// +-----+----------+
478 /// ```
479 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
480 let mut columns = Vec::with_capacity(self.columns.len() + 1);
481 let offset = offset.unwrap_or(0);
482
483 let col = Column::new_row_index(name, offset, self.height())?;
484 columns.push(col);
485 columns.extend_from_slice(&self.columns);
486 DataFrame::new(columns)
487 }
488
489 /// Add a row index column in place.
490 ///
491 /// # Safety
492 /// The caller should ensure the DataFrame does not already contain a column with the given name.
493 ///
494 /// # Panics
495 /// Panics if the resulting column would reach or overflow IdxSize::MAX.
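///
/// # Example
///
/// A sketch of intended use; the caller upholds the uniqueness contract (names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("Name" => ["James", "Mary"])?;
/// // SAFETY: no column named "Id" exists yet.
/// unsafe { df.with_row_index_mut("Id".into(), None) };
/// assert_eq!(df.get_column_names(), &["Id", "Name"]);
/// # Ok::<(), PolarsError>(())
/// ```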
496 pub unsafe fn with_row_index_mut(
497 &mut self,
498 name: PlSmallStr,
499 offset: Option<IdxSize>,
500 ) -> &mut Self {
501 // TODO: Make this function unsafe
502 debug_assert!(
503 self.columns.iter().all(|c| c.name() != &name),
504 "with_row_index_mut(): column with name {} already exists",
505 &name
506 );
507
508 let offset = offset.unwrap_or(0);
509 let col = Column::new_row_index(name, offset, self.height()).unwrap();
510
511 self.clear_schema();
512 self.columns.insert(0, col);
513 self
514 }
515
516 /// Create a new `DataFrame` without checking the column lengths or for duplicate column
517 /// names.
518 ///
519 /// Calculates the height from the first column or `0` if no columns are given.
520 ///
521 /// # Safety
522 ///
523 /// It is the caller's responsibility to uphold the contract of all `Series`
524 /// having an equal length and a unique name; if not, this may panic down the line.
525 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
526 let height = columns.first().map_or(0, Column::len);
527 unsafe { Self::new_no_checks(height, columns) }
528 }
529
530 /// Create a new `DataFrame` without checking the column lengths or for duplicate column
531 /// names.
532 ///
533 /// It is advised to use [DataFrame::new] in favor of this method.
534 ///
535 /// # Safety
536 ///
537 /// It is the caller's responsibility to uphold the contract of all `Series`
538 /// having an equal length and a unique name; if not, this may panic down the line.
539 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
540 if cfg!(debug_assertions) {
541 DataFrame::validate_columns_slice(&columns).unwrap();
542 }
543
544 unsafe { Self::_new_no_checks_impl(height, columns) }
545 }
546
547 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
548 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
549 /// constructed with this method is generally highly unsafe and should not be long-lived.
550 #[allow(clippy::missing_safety_doc)]
551 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
552 DataFrame {
553 height,
554 columns,
555 cached_schema: OnceLock::new(),
556 }
557 }
558
559 /// Shrink the capacity of this DataFrame to fit its length.
560 pub fn shrink_to_fit(&mut self) {
561 // Don't parallelize this. Memory overhead
562 for s in &mut self.columns {
563 s.shrink_to_fit();
564 }
565 }
566
567 /// Aggregate all the chunks in the DataFrame to a single chunk.
568 pub fn as_single_chunk(&mut self) -> &mut Self {
569 // Don't parallelize this. Memory overhead
570 for s in &mut self.columns {
571 *s = s.rechunk();
572 }
573 self
574 }
575
576 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
577 /// This may lead to more peak memory consumption.
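///
/// # Example
///
/// A small sketch: vertically stacking adds chunks, which this call merges again
/// (the column name is illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2])?;
/// df.vstack_mut(&df!("a" => [3])?)?; // column "a" now has two chunks
/// df.as_single_chunk_par();
/// assert_eq!(df.first_col_n_chunks(), 1);
/// # Ok::<(), PolarsError>(())
/// ```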
578 pub fn as_single_chunk_par(&mut self) -> &mut Self {
579 if self.columns.iter().any(|c| c.n_chunks() > 1) {
580 self.columns = self._apply_columns_par(&|s| s.rechunk());
581 }
582 self
583 }
584
585 /// Rechunks all columns to only have a single chunk.
586 pub fn rechunk_mut(&mut self) {
587 // SAFETY: We never adjust the length or names of the columns.
588 let columns = unsafe { self.get_columns_mut() };
589
590 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
591 *col = col.rechunk();
592 }
593 }
594
595 pub fn _deshare_views_mut(&mut self) {
596 // SAFETY: We never adjust the length or names of the columns.
597 unsafe {
598 let columns = self.get_columns_mut();
599 for col in columns {
600 let Column::Series(s) = col else { continue };
601
602 if let Ok(ca) = s.binary() {
603 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
604 *col = Column::from(gc_ca.into_series());
605 } else if let Ok(ca) = s.str() {
606 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
607 *col = Column::from(gc_ca.into_series());
608 }
609 }
610 }
611 }
612
613 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
614 pub fn rechunk_to_record_batch(
615 self,
616 compat_level: CompatLevel,
617 ) -> RecordBatchT<Box<dyn Array>> {
618 let height = self.height();
619
620 let (schema, arrays) = self
621 .columns
622 .into_iter()
623 .map(|col| {
624 let mut series = col.take_materialized_series();
625 // Rechunk to one chunk if necessary
626 if series.n_chunks() > 1 {
627 series = series.rechunk();
628 }
629 (
630 series.field().to_arrow(compat_level),
631 series.to_arrow(0, compat_level),
632 )
633 })
634 .collect();
635
636 RecordBatchT::new(height, Arc::new(schema), arrays)
637 }
638
639 /// Returns true if the chunks of the columns do not align and re-chunking should be done.
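///
/// # Example
///
/// A sketch of a misaligned frame: one column has two chunks, the other only one
/// (column names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2])?;
/// df.vstack_mut(&df!("a" => [3])?)?; // column "a" now has two chunks
/// let df = df.hstack(&[Column::new("b".into(), [4, 5, 6])])?; // column "b" has one chunk
/// assert!(df.should_rechunk());
/// # Ok::<(), PolarsError>(())
/// ```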
640 pub fn should_rechunk(&self) -> bool {
641 // Fast check. It is also needed for correctness, as code below doesn't check if the number
642 // of chunks is equal.
643 if !self
644 .get_columns()
645 .iter()
646 .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
647 .all_equal()
648 {
649 return true;
650 }
651
652 // From here we check chunk lengths.
653 let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
654 match chunk_lengths.next() {
655 None => false,
656 Some(first_column_chunk_lengths) => {
657 // Fast Path for single Chunk Series
658 if first_column_chunk_lengths.size_hint().0 == 1 {
659 return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
660 }
661 // Always rechunk if we have more chunks than rows.
662 // except when we have an empty df containing a single chunk
663 let height = self.height();
664 let n_chunks = first_column_chunk_lengths.size_hint().0;
665 if n_chunks > height && !(height == 0 && n_chunks == 1) {
666 return true;
667 }
668 // Slow Path for multi Chunk series
669 let v: Vec<_> = first_column_chunk_lengths.collect();
670 for cl in chunk_lengths {
671 if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
672 return true;
673 }
674 }
675 false
676 },
677 }
678 }
679
680 /// Ensure all the chunks in the [`DataFrame`] are aligned.
681 pub fn align_chunks_par(&mut self) -> &mut Self {
682 if self.should_rechunk() {
683 self.as_single_chunk_par()
684 } else {
685 self
686 }
687 }
688
689 pub fn align_chunks(&mut self) -> &mut Self {
690 if self.should_rechunk() {
691 self.as_single_chunk()
692 } else {
693 self
694 }
695 }
696
697 /// Get the [`DataFrame`] schema.
698 ///
699 /// # Example
700 ///
701 /// ```rust
702 /// # use polars_core::prelude::*;
703 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
704 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
705 ///
706 /// let f1: Field = Field::new("Thing".into(), DataType::String);
707 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
708 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
709 ///
710 /// assert_eq!(&**df.schema(), &sc);
711 /// # Ok::<(), PolarsError>(())
712 /// ```
713 pub fn schema(&self) -> &SchemaRef {
714 let out = self.cached_schema.get_or_init(|| {
715 Arc::new(
716 self.columns
717 .iter()
718 .map(|x| (x.name().clone(), x.dtype().clone()))
719 .collect(),
720 )
721 });
722
723 debug_assert_eq!(out.len(), self.width());
724
725 out
726 }
727
728 /// Get a reference to the [`DataFrame`] columns.
729 ///
730 /// # Example
731 ///
732 /// ```rust
733 /// # use polars_core::prelude::*;
734 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
735 /// "Symbol" => ["A", "C", "G", "T"])?;
736 /// let columns: &[Column] = df.get_columns();
737 ///
738 /// assert_eq!(columns[0].name(), "Name");
739 /// assert_eq!(columns[1].name(), "Symbol");
740 /// # Ok::<(), PolarsError>(())
741 /// ```
742 #[inline]
743 pub fn get_columns(&self) -> &[Column] {
744 &self.columns
745 }
746
747 #[inline]
748 /// Get mutable access to the underlying columns.
749 ///
750 /// # Safety
751 ///
752 /// The caller must ensure the length of all [`Series`] remains equal to `height` or
753 /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
754 /// The caller must ensure that the cached schema is cleared if it modifies the schema by
755 /// calling [`DataFrame::clear_schema`].
756 pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
757 &mut self.columns
758 }
759
760 #[inline]
761 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
762 pub fn clear_columns(&mut self) {
763 unsafe { self.get_columns_mut() }.clear();
764 self.clear_schema();
765 }
766
767 #[inline]
768 /// Extend the columns without checking for name collisions or height.
769 ///
770 /// # Safety
771 ///
772 /// The caller needs to ensure that:
773 /// - Column names are unique within the resulting [`DataFrame`].
774 /// - The length of each appended column matches the height of the [`DataFrame`]. For
775 /// [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
776 /// with [`DataFrame::set_height`].
777 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
778 unsafe { self.get_columns_mut() }.extend(iter);
779 self.clear_schema();
780 }
781
782 /// Take ownership of the underlying columns vec.
783 pub fn take_columns(self) -> Vec<Column> {
784 self.columns
785 }
786
787 /// Iterator over the columns as [`Series`].
788 ///
789 /// # Example
790 ///
791 /// ```rust
792 /// # use polars_core::prelude::*;
793 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
794 /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
795 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
796 ///
797 /// let mut iterator = df.iter();
798 ///
799 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
800 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
801 /// assert_eq!(iterator.next(), None);
802 /// # Ok::<(), PolarsError>(())
803 /// ```
804 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
805 self.materialized_column_iter()
806 }
807
808 /// # Example
809 ///
810 /// ```rust
811 /// # use polars_core::prelude::*;
812 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
813 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
814 ///
815 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
816 /// # Ok::<(), PolarsError>(())
817 /// ```
818 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
819 self.columns.iter().map(|s| s.name()).collect()
820 }
821
822 /// Get the [`Vec<PlSmallStr>`] representing the column names.
823 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
824 self.columns.iter().map(|s| s.name().clone()).collect()
825 }
826
827 pub fn get_column_names_str(&self) -> Vec<&str> {
828 self.columns.iter().map(|s| s.name().as_str()).collect()
829 }
830
831 /// Set the column names.
832 /// # Example
833 ///
834 /// ```rust
835 /// # use polars_core::prelude::*;
836 /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
837 /// df.set_column_names(["Set"])?;
838 ///
839 /// assert_eq!(df.get_column_names(), &["Set"]);
840 /// # Ok::<(), PolarsError>(())
841 /// ```
842 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
843 where
844 I: IntoIterator<Item = S>,
845 S: Into<PlSmallStr>,
846 {
847 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
848 self._set_column_names_impl(names.as_slice())
849 }
850
851 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
852 polars_ensure!(
853 names.len() == self.width(),
854 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
855 names.len(), self.width()
856 );
857 ensure_names_unique(names, |s| s.as_str())?;
858
859 let columns = mem::take(&mut self.columns);
860 self.columns = columns
861 .into_iter()
862 .zip(names)
863 .map(|(s, name)| {
864 let mut s = s;
865 s.rename(name.clone());
866 s
867 })
868 .collect();
869 self.clear_schema();
870 Ok(())
871 }
872
873 /// Get the data types of the columns in the [`DataFrame`].
874 ///
875 /// # Example
876 ///
877 /// ```rust
878 /// # use polars_core::prelude::*;
879 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
880 /// "Fraction" => [0.965, 0.035])?;
881 ///
882 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
883 /// # Ok::<(), PolarsError>(())
884 /// ```
885 pub fn dtypes(&self) -> Vec<DataType> {
886 self.columns.iter().map(|s| s.dtype().clone()).collect()
887 }
888
889 pub(crate) fn first_series_column(&self) -> Option<&Series> {
890 self.columns.iter().find_map(|col| col.as_series())
891 }
892
893 /// The number of chunks for the first column.
894 pub fn first_col_n_chunks(&self) -> usize {
895 match self.first_series_column() {
896 None if self.columns.is_empty() => 0,
897 None => 1,
898 Some(s) => s.n_chunks(),
899 }
900 }
901
902 /// The highest number of chunks for any column.
903 pub fn max_n_chunks(&self) -> usize {
904 self.columns
905 .iter()
906 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
907 .max()
908 .unwrap_or(0)
909 }
910
911 /// Get a reference to the schema fields of the [`DataFrame`].
912 ///
913 /// # Example
914 ///
915 /// ```rust
916 /// # use polars_core::prelude::*;
917 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
918 /// "Fraction" => [0.708, 0.292])?;
919 ///
920 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
921 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
922 ///
923 /// assert_eq!(earth.fields(), &[f1, f2]);
924 /// # Ok::<(), PolarsError>(())
925 /// ```
926 pub fn fields(&self) -> Vec<Field> {
927 self.columns
928 .iter()
929 .map(|s| s.field().into_owned())
930 .collect()
931 }
932
933 /// Get (height, width) of the [`DataFrame`].
934 ///
935 /// # Example
936 ///
937 /// ```rust
938 /// # use polars_core::prelude::*;
939 /// let df0: DataFrame = DataFrame::default();
940 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
941 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
942 /// "2" => [1, 2, 3, 4, 5])?;
943 ///
944 /// assert_eq!(df0.shape(), (0 ,0));
945 /// assert_eq!(df1.shape(), (5, 1));
946 /// assert_eq!(df2.shape(), (5, 2));
947 /// # Ok::<(), PolarsError>(())
948 /// ```
949 pub fn shape(&self) -> (usize, usize) {
950 (self.height, self.columns.len())
951 }
952
953 /// Get the width of the [`DataFrame`] which is the number of columns.
954 ///
955 /// # Example
956 ///
957 /// ```rust
958 /// # use polars_core::prelude::*;
959 /// let df0: DataFrame = DataFrame::default();
960 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
961 /// let df2: DataFrame = df!("Series 1" => [0; 0],
962 /// "Series 2" => [0; 0])?;
963 ///
964 /// assert_eq!(df0.width(), 0);
965 /// assert_eq!(df1.width(), 1);
966 /// assert_eq!(df2.width(), 2);
967 /// # Ok::<(), PolarsError>(())
968 /// ```
969 pub fn width(&self) -> usize {
970 self.columns.len()
971 }
972
973 /// Get the height of the [`DataFrame`] which is the number of rows.
974 ///
975 /// # Example
976 ///
977 /// ```rust
978 /// # use polars_core::prelude::*;
979 /// let df0: DataFrame = DataFrame::default();
980 /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
981 /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
982 ///
983 /// assert_eq!(df0.height(), 0);
984 /// assert_eq!(df1.height(), 2);
985 /// assert_eq!(df2.height(), 5);
986 /// # Ok::<(), PolarsError>(())
987 /// ```
988 pub fn height(&self) -> usize {
989 self.height
990 }
991
992 /// Returns the size as number of rows * number of columns
993 pub fn size(&self) -> usize {
994 let s = self.shape();
995 s.0 * s.1
996 }
997
998 /// Returns `true` if the [`DataFrame`] contains no rows.
999 ///
1000 /// # Example
1001 ///
1002 /// ```rust
1003 /// # use polars_core::prelude::*;
1004 /// let df1: DataFrame = DataFrame::default();
1005 /// assert!(df1.is_empty());
1006 ///
1007 /// let df2: DataFrame = df!("First name" => ["Forever"],
1008 /// "Last name" => ["Alone"])?;
1009 /// assert!(!df2.is_empty());
1010 /// # Ok::<(), PolarsError>(())
1011 /// ```
1012 pub fn is_empty(&self) -> bool {
1013 matches!(self.shape(), (0, _) | (_, 0))
1014 }
1015
1016 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1017 ///
1018 /// # Safety
1019 ///
1020 /// This needs to be equal to the length of all the columns.
1021 pub unsafe fn set_height(&mut self, height: usize) {
1022 self.height = height;
1023 }
1024
1025 /// Add multiple [`Column`]s to a [`DataFrame`].
1026 /// The added columns are required to have the same length as the existing columns.
1027 ///
1028 /// # Example
1029 ///
1030 /// ```rust
1031 /// # use polars_core::prelude::*;
1032 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1033 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1034 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1035 ///
1036 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1037 /// assert_eq!(df2.shape(), (3, 3));
1038 /// println!("{}", df2);
1039 /// # Ok::<(), PolarsError>(())
1040 /// ```
1041 ///
1042 /// Output:
1043 ///
1044 /// ```text
1045 /// shape: (3, 3)
1046 /// +---------+--------+----------+
1047 /// | Element | Proton | Electron |
1048 /// | --- | --- | --- |
1049 /// | str | i32 | i32 |
1050 /// +=========+========+==========+
1051 /// | Copper | 29 | 29 |
1052 /// +---------+--------+----------+
1053 /// | Silver | 47 | 47 |
1054 /// +---------+--------+----------+
1055 /// | Gold | 79 | 79 |
1056 /// +---------+--------+----------+
1057 /// ```
1058 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1059 let mut new_cols = self.columns.clone();
1060 new_cols.extend_from_slice(columns);
1061 DataFrame::new(new_cols)
1062 }
1063
1064 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return a newly allocated [`DataFrame`].
1065 ///
1066 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1067 ///
1068 /// # Example
1069 ///
1070 /// ```rust
1071 /// # use polars_core::prelude::*;
1072 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1073 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1074 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1075 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1076 ///
1077 /// let df3: DataFrame = df1.vstack(&df2)?;
1078 ///
1079 /// assert_eq!(df3.shape(), (5, 2));
1080 /// println!("{}", df3);
1081 /// # Ok::<(), PolarsError>(())
1082 /// ```
1083 ///
1084 /// Output:
1085 ///
1086 /// ```text
1087 /// shape: (5, 2)
1088 /// +-----------+-------------------+
1089 /// | Element | Melting Point (K) |
1090 /// | --- | --- |
1091 /// | str | f64 |
1092 /// +===========+===================+
1093 /// | Copper | 1357.77 |
1094 /// +-----------+-------------------+
1095 /// | Silver | 1234.93 |
1096 /// +-----------+-------------------+
1097 /// | Gold | 1337.33 |
1098 /// +-----------+-------------------+
1099 /// | Platinum | 2041.4 |
1100 /// +-----------+-------------------+
1101 /// | Palladium | 1828.05 |
1102 /// +-----------+-------------------+
1103 /// ```
1104 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1105 let mut df = self.clone();
1106 df.vstack_mut(other)?;
1107 Ok(df)
1108 }
1109
1110 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1111 ///
1112 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1113 ///
1114 /// # Example
1115 ///
1116 /// ```rust
1117 /// # use polars_core::prelude::*;
1118 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1119 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1120 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1121 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1122 ///
1123 /// df1.vstack_mut(&df2)?;
1124 ///
1125 /// assert_eq!(df1.shape(), (5, 2));
1126 /// println!("{}", df1);
1127 /// # Ok::<(), PolarsError>(())
1128 /// ```
1129 ///
1130 /// Output:
1131 ///
1132 /// ```text
1133 /// shape: (5, 2)
1134 /// +-----------+-------------------+
1135 /// | Element | Melting Point (K) |
1136 /// | --- | --- |
1137 /// | str | f64 |
1138 /// +===========+===================+
1139 /// | Copper | 1357.77 |
1140 /// +-----------+-------------------+
1141 /// | Silver | 1234.93 |
1142 /// +-----------+-------------------+
1143 /// | Gold | 1337.33 |
1144 /// +-----------+-------------------+
1145 /// | Platinum | 2041.4 |
1146 /// +-----------+-------------------+
1147 /// | Palladium | 1828.05 |
1148 /// +-----------+-------------------+
1149 /// ```
1150 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1151 if self.width() != other.width() {
1152 polars_ensure!(
1153 self.width() == 0,
1154 ShapeMismatch:
1155 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1156 self.width(), other.width(),
1157 );
1158 self.columns.clone_from(&other.columns);
1159 self.height = other.height;
1160 return Ok(self);
1161 }
1162
1163 self.columns
1164 .iter_mut()
1165 .zip(other.columns.iter())
1166 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1167 ensure_can_extend(&*left, right)?;
1168 left.append(right).map_err(|e| {
1169 e.context(format!("failed to vstack column '{}'", right.name()).into())
1170 })?;
1171 Ok(())
1172 })?;
1173 self.height += other.height;
1174 Ok(self)
1175 }
1176
1177 pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1178 if self.width() != other.width() {
1179 polars_ensure!(
1180 self.width() == 0,
1181 ShapeMismatch:
1182 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1183 self.width(), other.width(),
1184 );
1185 self.columns = other.columns;
1186 self.height = other.height;
1187 return Ok(self);
1188 }
1189
1190 self.columns
1191 .iter_mut()
1192 .zip(other.columns.into_iter())
1193 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1194 ensure_can_extend(&*left, &right)?;
1195 let right_name = right.name().clone();
1196 left.append_owned(right).map_err(|e| {
1197 e.context(format!("failed to vstack column '{right_name}'").into())
1198 })?;
1199 Ok(())
1200 })?;
1201 self.height += other.height;
1202 Ok(self)
1203 }
1204
1205 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1206 ///
1207 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1208 ///
1209 /// # Panics
1210 /// Panics if the schemas don't match.
1211 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1212 self.columns
1213 .iter_mut()
1214 .zip(other.columns.iter())
1215 .for_each(|(left, right)| {
1216 left.append(right)
1217 .map_err(|e| {
1218 e.context(format!("failed to vstack column '{}'", right.name()).into())
1219 })
1220 .expect("should not fail");
1221 });
1222 self.height += other.height;
1223 }
1224
1225 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1226 ///
1227 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1228 ///
1229 /// # Panics
1230 /// Panics if the schemas don't match.
1231 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1232 self.columns
1233 .iter_mut()
1234 .zip(other.columns)
1235 .for_each(|(left, right)| {
1236 left.append_owned(right).expect("should not fail");
1237 });
1238 self.height += other.height;
1239 }
1240
1241 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1242 ///
1243 /// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
1244 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1245 ///
1246 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1247 /// and thus will yield faster queries.
1248 ///
1249 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1250 /// online operations where you add `n` rows and rerun a query.
1251 ///
1252 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1253 /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1254 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
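///
/// # Example
///
/// A minimal sketch (the column name is illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df1 = df!("x" => [1, 2])?;
/// let df2 = df!("x" => [3])?;
/// // The rows of `df2` are appended to the existing buffers of `df1`.
/// df1.extend(&df2)?;
/// assert_eq!(df1.height(), 3);
/// # Ok::<(), PolarsError>(())
/// ```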
1255 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1256 polars_ensure!(
1257 self.width() == other.width(),
1258 ShapeMismatch:
1259 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1260 self.width(), other.width(),
1261 );
1262
1263 self.columns
1264 .iter_mut()
1265 .zip(other.columns.iter())
1266 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1267 ensure_can_extend(&*left, right)?;
1268 left.extend(right).map_err(|e| {
1269 e.context(format!("failed to extend column '{}'", right.name()).into())
1270 })?;
1271 Ok(())
1272 })?;
1273 self.height += other.height;
1274 self.clear_schema();
1275 Ok(())
1276 }
1277
1278 /// Remove a column by name and return the column removed.
1279 ///
1280 /// # Example
1281 ///
1282 /// ```rust
1283 /// # use polars_core::prelude::*;
1284 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1285 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1286 ///
1287 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1288 /// assert!(s1.is_err());
1289 ///
1290 /// let s2: Column = df.drop_in_place("Animal")?;
1291 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1292 /// # Ok::<(), PolarsError>(())
1293 /// ```
1294 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1295 let idx = self.check_name_to_idx(name)?;
1296 self.clear_schema();
1297 Ok(self.columns.remove(idx))
1298 }
1299
1300 /// Return a new [`DataFrame`] where all null values are dropped.
1301 ///
1302 /// # Example
1303 ///
1304 /// ```no_run
1305 /// # use polars_core::prelude::*;
1306 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1307 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1308 /// assert_eq!(df1.shape(), (3, 2));
1309 ///
1310 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1311 /// assert_eq!(df2.shape(), (1, 2));
1312 /// println!("{}", df2);
1313 /// # Ok::<(), PolarsError>(())
1314 /// ```
1315 ///
1316 /// Output:
1317 ///
1318 /// ```text
1319 /// shape: (1, 2)
1320 /// +---------+---------------------+
1321 /// | Country | Tax revenue (% GDP) |
1322 /// | --- | --- |
1323 /// | str | f64 |
1324 /// +=========+=====================+
1325 /// | Malta | 32.7 |
1326 /// +---------+---------------------+
1327 /// ```
1328 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1329 where
1330 for<'a> &'a S: Into<PlSmallStr>,
1331 {
1332 if let Some(v) = subset {
1333 let v = self.select_columns(v)?;
1334 self._drop_nulls_impl(v.as_slice())
1335 } else {
1336 self._drop_nulls_impl(self.columns.as_slice())
1337 }
1338 }
1339
1340 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1341 // fast path for no nulls in df
1342 if subset.iter().all(|s| !s.has_nulls()) {
1343 return Ok(self.clone());
1344 }
1345
1346 let mut iter = subset.iter();
1347
1348 let mask = iter
1349 .next()
1350 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1351 let mut mask = mask.is_not_null();
1352
1353 for c in iter {
1354 mask = mask & c.is_not_null();
1355 }
1356 self.filter(&mask)
1357 }
1358
1359 /// Drop a column by name.
1360 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1361 /// the current one in place.
1362 ///
1363 /// # Example
1364 ///
1365 /// ```rust
1366 /// # use polars_core::prelude::*;
1367 /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1368 /// let df2: DataFrame = df1.drop("Ray type")?;
1369 ///
1370 /// assert!(df2.is_empty());
1371 /// # Ok::<(), PolarsError>(())
1372 /// ```
1373 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1374 let idx = self.check_name_to_idx(name)?;
1375 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1376
1377 self.columns.iter().enumerate().for_each(|(i, s)| {
1378 if i != idx {
1379 new_cols.push(s.clone())
1380 }
1381 });
1382
1383 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1384 }
1385
1386 /// Drop columns that are in `names`.
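///
/// # Example
///
/// A minimal sketch (column names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("a" => [1], "b" => [2], "c" => [3])?;
/// let df = df.drop_many(["a", "c"]);
/// assert_eq!(df.get_column_names(), &["b"]);
/// # Ok::<(), PolarsError>(())
/// ```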
1387 pub fn drop_many<I, S>(&self, names: I) -> Self
1388 where
1389 I: IntoIterator<Item = S>,
1390 S: Into<PlSmallStr>,
1391 {
1392 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1393 self.drop_many_amortized(&names)
1394 }
1395
1396 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1397 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1398 if names.is_empty() {
1399 return self.clone();
1400 }
1401 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1402 self.columns.iter().for_each(|s| {
1403 if !names.contains(s.name()) {
1404 new_cols.push(s.clone())
1405 }
1406 });
1407
1408 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1409 }
1410
1411 /// Insert a new column at a given index without checking for duplicates.
1412 /// This can leave the [`DataFrame`] in an invalid state.
1413 fn insert_column_no_name_check(
1414 &mut self,
1415 index: usize,
1416 column: Column,
1417 ) -> PolarsResult<&mut Self> {
1418 polars_ensure!(
1419 self.width() == 0 || column.len() == self.height(),
1420 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1421 column.len(), self.height(),
1422 );
1423
1424 if self.width() == 0 {
1425 self.height = column.len();
1426 }
1427
1428 self.columns.insert(index, column);
1429 self.clear_schema();
1430 Ok(self)
1431 }
1432
1433 /// Insert a new column at a given index.
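///
/// # Example
///
/// A minimal sketch (column names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2], "c" => [5, 6])?;
/// df.insert_column(1, Column::new("b".into(), [3, 4]))?;
/// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
/// # Ok::<(), PolarsError>(())
/// ```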
1434 pub fn insert_column<S: IntoColumn>(
1435 &mut self,
1436 index: usize,
1437 column: S,
1438 ) -> PolarsResult<&mut Self> {
1439 let column = column.into_column();
1440 self.check_already_present(column.name().as_str())?;
1441 self.insert_column_no_name_check(index, column)
1442 }
1443
1444 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1445 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1446 self.replace_column(idx, column)?;
1447 } else {
1448 if self.width() == 0 {
1449 self.height = column.len();
1450 }
1451
1452 self.columns.push(column);
1453 self.clear_schema();
1454 }
1455 Ok(())
1456 }
1457
1458 /// Add a new column to this [`DataFrame`] or replace an existing one.
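///
/// # Example
///
/// A minimal sketch; a length-1 column is broadcast to the height of the frame
/// (column names are illustrative):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3])?;
/// df.with_column(Column::new("b".into(), [10]))?;
/// assert_eq!(df.shape(), (3, 2));
/// # Ok::<(), PolarsError>(())
/// ```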
1459 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1460 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1461 let height = df.height();
1462 if column.len() == 1 && height > 1 {
1463 column = column.new_from_index(0, height);
1464 }
1465
1466 if column.len() == height || df.get_columns().is_empty() {
1467 df.add_column_by_search(column)?;
1468 Ok(df)
1469 }
1470 // special case for literals
1471 else if height == 0 && column.len() == 1 {
1472 let s = column.clear();
1473 df.add_column_by_search(s)?;
1474 Ok(df)
1475 } else {
1476 polars_bail!(
1477 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1478 column.len(), height,
1479 );
1480 }
1481 }
1482 let column = column.into_column();
1483 inner(self, column)
1484 }
1485
1486 /// Adds a column to the [`DataFrame`] without doing any checks
1487 /// on length or duplicates.
1488 ///
1489 /// # Safety
1490 /// The caller must ensure `self.width() == 0 || column.len() == self.height()`.
1491 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1492 debug_assert!(self.width() == 0 || self.height() == column.len());
1493 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1494
1495 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1496 // properly for `width` == 0.
1497 if self.width() == 0 {
1498 unsafe { self.set_height(column.len()) };
1499 }
1500 unsafe { self.get_columns_mut() }.push(column);
1501 self.clear_schema();
1502
1503 self
1504 }
1505
1506 // Note: Schema can be both input or output_schema
1507 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1508 let name = c.name();
1509 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1510 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1511 // Given schema is output_schema and we can push.
1512 if idx == self.columns.len() {
1513 if self.width() == 0 {
1514 self.height = c.len();
1515 }
1516
1517 self.columns.push(c);
1518 self.clear_schema();
1519 }
1520 // Schema is incorrect fallback to search
1521 else {
1522 debug_assert!(false);
1523 self.add_column_by_search(c)?;
1524 }
1525 } else {
1526 self.replace_column(idx, c)?;
1527 }
1528 } else {
1529 if self.width() == 0 {
1530 self.height = c.len();
1531 }
1532
1533 self.columns.push(c);
1534 self.clear_schema();
1535 }
1536
1537 Ok(())
1538 }
1539
1540 // Note: Schema can be both input or output_schema
1541 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1542 for (i, s) in series.into_iter().enumerate() {
1543 // we need to branch here
1544 // because users can add multiple columns with the same name
1545 if i == 0 || schema.get(s.name().as_str()).is_some() {
1546 self.with_column_and_schema(s.into_column(), schema)?;
1547 } else {
1548 self.with_column(s.clone().into_column())?;
1549 }
1550 }
1551 Ok(())
1552 }
1553
1554 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1555 for (i, s) in columns.into_iter().enumerate() {
1556 // we need to branch here
1557 // because users can add multiple columns with the same name
1558 if i == 0 || schema.get(s.name().as_str()).is_some() {
1559 self.with_column_and_schema(s, schema)?;
1560 } else {
1561 self.with_column(s.clone())?;
1562 }
1563 }
1564
1565 Ok(())
1566 }
1567
1568 /// Add a new column to this [`DataFrame`] or replace an existing one.
1569 /// Uses an existing schema to amortize lookups.
1570 /// If the schema is incorrect, we will fall back to a linear search.
1571 ///
1572 /// Note: Schema can be both input or output_schema
1573 pub fn with_column_and_schema<C: IntoColumn>(
1574 &mut self,
1575 column: C,
1576 schema: &Schema,
1577 ) -> PolarsResult<&mut Self> {
1578 let mut column = column.into_column();
1579
1580 let height = self.height();
1581 if column.len() == 1 && height > 1 {
1582 column = column.new_from_index(0, height);
1583 }
1584
1585 if column.len() == height || self.columns.is_empty() {
1586 self.add_column_by_schema(column, schema)?;
1587 Ok(self)
1588 }
1589 // special case for literals
1590 else if height == 0 && column.len() == 1 {
1591 let s = column.clear();
1592 self.add_column_by_schema(s, schema)?;
1593 Ok(self)
1594 } else {
1595 polars_bail!(
1596 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1597 column.len(), height,
1598 );
1599 }
1600 }
1601
1602 /// Get a row in the [`DataFrame`]. Beware this is slow.
1603 ///
1604 /// # Example
1605 ///
1606 /// ```
1607 /// # use polars_core::prelude::*;
1608 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1609 /// df.get(idx)
1610 /// }
1611 /// ```
1612 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1613 match self.columns.first() {
1614 Some(s) => {
1615 if s.len() <= idx {
1616 return None;
1617 }
1618 },
1619 None => return None,
1620 }
1621 // SAFETY: we just checked bounds
1622 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1623 }
1624
1625 /// Select a [`Column`] by index.
1626 ///
1627 /// # Example
1628 ///
1629 /// ```rust
1630 /// # use polars_core::prelude::*;
1631 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1632 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1633 ///
1634 /// let s1: Option<&Column> = df.select_at_idx(0);
1635 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1636 ///
1637 /// assert_eq!(s1, Some(&s2));
1638 /// # Ok::<(), PolarsError>(())
1639 /// ```
1640 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1641 self.columns.get(idx)
1642 }
1643
1644 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1645 ///
1646 /// # Examples
1647 ///
1648 /// ```rust
1649 /// # use polars_core::prelude::*;
1650 /// let df = df! {
1651 /// "0" => [0, 0, 0],
1652 /// "1" => [1, 1, 1],
1653 /// "2" => [2, 2, 2]
1654 /// }?;
1655 ///
1656 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1657 /// assert!(df.equals(&df.select_by_range(..)?));
1658 /// # Ok::<(), PolarsError>(())
1659 /// ```
1660 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1661 where
1662 R: ops::RangeBounds<usize>,
1663 {
1664 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1665 // because it is a nightly-only feature. We should switch to that function once it is stabilized.
1666 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1667 where
1668 R: ops::RangeBounds<usize>,
1669 {
1670 let len = bounds.end;
1671
1672 let start: ops::Bound<&usize> = range.start_bound();
1673 let start = match start {
1674 ops::Bound::Included(&start) => start,
1675 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1676 panic!("attempted to index slice from after maximum usize");
1677 }),
1678 ops::Bound::Unbounded => 0,
1679 };
1680
1681 let end: ops::Bound<&usize> = range.end_bound();
1682 let end = match end {
1683 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1684 panic!("attempted to index slice up to maximum usize");
1685 }),
1686 ops::Bound::Excluded(&end) => end,
1687 ops::Bound::Unbounded => len,
1688 };
1689
1690 if start > end {
1691 panic!("slice index starts at {start} but ends at {end}");
1692 }
1693 if end > len {
1694 panic!("range end index {end} out of range for slice of length {len}",);
1695 }
1696
1697 ops::Range { start, end }
1698 }
1699
1700 let colnames = self.get_column_names_owned();
1701 let range = get_range(range, ..colnames.len());
1702
1703 self._select_impl(&colnames[range])
1704 }
1705
1706 /// Get column index of a [`Series`] by name.
1707 /// # Example
1708 ///
1709 /// ```rust
1710 /// # use polars_core::prelude::*;
1711 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1712 /// "Health" => [100, 200, 500],
1713 /// "Mana" => [250, 100, 0],
1714 /// "Strength" => [30, 150, 300])?;
1715 ///
1716 /// assert_eq!(df.get_column_index("Name"), Some(0));
1717 /// assert_eq!(df.get_column_index("Health"), Some(1));
1718 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1719 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1720 /// assert_eq!(df.get_column_index("Haste"), None);
1721 /// # Ok::<(), PolarsError>(())
1722 /// ```
1723 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1724 let schema = self.schema();
1725 if let Some(idx) = schema.index_of(name) {
1726 if self
1727 .get_columns()
1728 .get(idx)
1729 .is_some_and(|c| c.name() == name)
1730 {
1731 return Some(idx);
1732 }
1733 }
1734
1735 self.columns.iter().position(|s| s.name().as_str() == name)
1736 }
1737
1738 /// Get column index of a [`Series`] by name.
1739 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1740 self.get_column_index(name)
1741 .ok_or_else(|| polars_err!(col_not_found = name))
1742 }
1743
1744 /// Select a single column by name.
1745 ///
1746 /// # Example
1747 ///
1748 /// ```rust
1749 /// # use polars_core::prelude::*;
1750 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1751 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1752 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1753 ///
1754 /// assert_eq!(df.column("Password")?, &s1);
1755 /// # Ok::<(), PolarsError>(())
1756 /// ```
1757 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1758 let idx = self.try_get_column_index(name)?;
1759 Ok(self.select_at_idx(idx).unwrap())
1760 }
1761
1762 /// Select multiple columns by name.
1763 ///
1764 /// # Example
1765 ///
1766 /// ```rust
1767 /// # use polars_core::prelude::*;
1768 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1769 /// "Max weight (kg)" => [16.0, 35.89])?;
1770 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1771 ///
1772 /// assert_eq!(&df[0], sv[0]);
1773 /// assert_eq!(&df[1], sv[1]);
1774 /// # Ok::<(), PolarsError>(())
1775 /// ```
1776 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1777 where
1778 I: IntoIterator<Item = S>,
1779 S: AsRef<str>,
1780 {
1781 names
1782 .into_iter()
1783 .map(|name| self.column(name.as_ref()))
1784 .collect()
1785 }
1786
1787 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1788 ///
1789 /// # Examples
1790 ///
1791 /// ```
1792 /// # use polars_core::prelude::*;
1793 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1794 /// df.select(["foo", "bar"])
1795 /// }
1796 /// ```
1797 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1798 where
1799 I: IntoIterator<Item = S>,
1800 S: Into<PlSmallStr>,
1801 {
1802 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1803 self._select_impl(cols.as_slice())
1804 }
1805
1806 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1807 ensure_names_unique(cols, |s| s.as_str())?;
1808 self._select_impl_unchecked(cols)
1809 }
1810
1811 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1812 let selected = self.select_columns_impl(cols)?;
1813 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1814 }
1815
1816 /// Select with a known schema. The schema names must match the column names of this DataFrame.
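    ///
    /// # Example
    ///
    /// A minimal sketch; the column names `"foo"` and `"bar"` are illustrative and `schema`
    /// is assumed to describe `df`:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame, schema: &SchemaRef) -> PolarsResult<DataFrame> {
    ///     df.select_with_schema(["foo", "bar"], schema)
    /// }
    /// ```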
1817 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1818 where
1819 I: IntoIterator<Item = S>,
1820 S: Into<PlSmallStr>,
1821 {
1822 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1823 self._select_with_schema_impl(&cols, schema, true)
1824 }
1825
1826 /// Select with a known schema without checking for duplicates in `selection`.
1827 /// The schema names must match the column names of this DataFrame.
1828 pub fn select_with_schema_unchecked<I, S>(
1829 &self,
1830 selection: I,
1831 schema: &Schema,
1832 ) -> PolarsResult<Self>
1833 where
1834 I: IntoIterator<Item = S>,
1835 S: Into<PlSmallStr>,
1836 {
1837 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1838 self._select_with_schema_impl(&cols, schema, false)
1839 }
1840
    /// The schema names must match the column names of this DataFrame.
1842 pub fn _select_with_schema_impl(
1843 &self,
1844 cols: &[PlSmallStr],
1845 schema: &Schema,
1846 check_duplicates: bool,
1847 ) -> PolarsResult<Self> {
1848 if check_duplicates {
1849 ensure_names_unique(cols, |s| s.as_str())?;
1850 }
1851
1852 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1853 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1854 }
1855
    /// A non-generic implementation to reduce compiler bloat.
1857 fn select_columns_impl_with_schema(
1858 &self,
1859 cols: &[PlSmallStr],
1860 schema: &Schema,
1861 ) -> PolarsResult<Vec<Column>> {
1862 if cfg!(debug_assertions) {
1863 ensure_matching_schema_names(schema, self.schema())?;
1864 }
1865
1866 cols.iter()
1867 .map(|name| {
1868 let index = schema.try_get_full(name.as_str())?.0;
1869 Ok(self.columns[index].clone())
1870 })
1871 .collect()
1872 }
1873
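    /// Select columns by name, converting each selected column to its physical representation.
    ///
    /// # Example
    ///
    /// A minimal sketch; the column name `"foo"` is illustrative:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.select_physical(["foo"])
    /// }
    /// ```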
1874 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1875 where
1876 I: IntoIterator<Item = S>,
1877 S: Into<PlSmallStr>,
1878 {
1879 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1880 self.select_physical_impl(&cols)
1881 }
1882
1883 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1884 ensure_names_unique(cols, |s| s.as_str())?;
1885 let selected = self.select_columns_physical_impl(cols)?;
1886 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1887 }
1888
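    /// Select the columns named in `to`, in that order, reusing `to` as the schema of the
    /// resulting [`DataFrame`].
    ///
    /// # Example
    ///
    /// A minimal sketch; `to` is assumed to describe a subset of the columns of `df`:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame, to: SchemaRef) -> PolarsResult<DataFrame> {
    ///     df.project(to)
    /// }
    /// ```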
1889 pub fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1890 let from = self.schema();
1891 let columns = to
1892 .iter_names()
1893 .map(|name| Ok(self.columns[from.try_index_of(name.as_str())?].clone()))
1894 .collect::<PolarsResult<Vec<_>>>()?;
1895 let mut df = unsafe { Self::new_no_checks(self.height(), columns) };
1896 df.cached_schema = to.into();
1897 Ok(df)
1898 }
1899
    /// Select column(s) from this [`DataFrame`] and return them as a [`Vec`].
1901 ///
1902 /// # Example
1903 ///
1904 /// ```rust
1905 /// # use polars_core::prelude::*;
1906 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1907 /// "Carbon" => [1, 2, 3],
1908 /// "Hydrogen" => [4, 6, 8])?;
1909 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1910 ///
1911 /// assert_eq!(df["Carbon"], sv[0]);
1912 /// assert_eq!(df["Hydrogen"], sv[1]);
1913 /// # Ok::<(), PolarsError>(())
1914 /// ```
1915 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1916 let cols = selection.into_vec();
1917 self.select_columns_impl(&cols)
1918 }
1919
1920 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1921 self.columns
1922 .iter()
1923 .enumerate()
1924 .map(|(i, s)| (s.name().as_str(), i))
1925 .collect()
1926 }
1927
    /// A non-generic implementation to reduce compiler bloat.
1929 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1930 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1931 let name_to_idx = self._names_to_idx_map();
1932 cols.iter()
1933 .map(|name| {
1934 let idx = *name_to_idx
1935 .get(name.as_str())
1936 .ok_or_else(|| polars_err!(col_not_found = name))?;
1937 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1938 })
1939 .collect::<PolarsResult<Vec<_>>>()?
1940 } else {
1941 cols.iter()
1942 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1943 .collect::<PolarsResult<Vec<_>>>()?
1944 };
1945
1946 Ok(selected)
1947 }
1948
    /// A non-generic implementation to reduce compiler bloat.
1950 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1951 let selected = if cols.len() > 1 && self.columns.len() > 10 {
            // We hash because there are users that have millions of columns.
            // See https://github.com/pola-rs/polars/issues/1023
1954 let name_to_idx = self._names_to_idx_map();
1955
1956 cols.iter()
1957 .map(|name| {
1958 let idx = *name_to_idx
1959 .get(name.as_str())
1960 .ok_or_else(|| polars_err!(col_not_found = name))?;
1961 Ok(self.select_at_idx(idx).unwrap().clone())
1962 })
1963 .collect::<PolarsResult<Vec<_>>>()?
1964 } else {
1965 cols.iter()
1966 .map(|c| self.column(c.as_str()).cloned())
1967 .collect::<PolarsResult<Vec<_>>>()?
1968 };
1969
1970 Ok(selected)
1971 }
1972
1973 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
        // If there is a filtered column, its length gives the new height.
1975 if let Some(fst) = filtered.first() {
1976 return fst.len();
1977 }
1978
        // Otherwise, count the number of rows that pass the mask and use that as the height.
1980 let num_trues = mask.num_trues();
1981 if mask.len() == self.height() {
1982 num_trues
1983 } else {
1984 // This is for broadcasting masks
1985 debug_assert!(num_trues == 0 || num_trues == 1);
1986 self.height() * num_trues
1987 }
1988 }
1989
1990 /// Take the [`DataFrame`] rows by a boolean mask.
1991 ///
1992 /// # Example
1993 ///
1994 /// ```
1995 /// # use polars_core::prelude::*;
1996 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1997 /// let mask = df.column("sepal_width")?.is_not_null();
1998 /// df.filter(&mask)
1999 /// }
2000 /// ```
2001 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2002 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2003 let height = self.filter_height(&new_col, mask);
2004
2005 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2006 }
2007
2008 /// Same as `filter` but does not parallelize.
2009 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2010 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2011 let height = self.filter_height(&new_col, mask);
2012
2013 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2014 }
2015
2016 /// Take [`DataFrame`] rows by index values.
2017 ///
2018 /// # Example
2019 ///
2020 /// ```
2021 /// # use polars_core::prelude::*;
2022 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2023 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2024 /// df.take(&idx)
2025 /// }
2026 /// ```
2027 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2028 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2029
2030 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2031 }
2032
2033 /// # Safety
2034 /// The indices must be in-bounds.
2035 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2036 self.take_unchecked_impl(idx, true)
2037 }
2038
2039 /// # Safety
2040 /// The indices must be in-bounds.
2041 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2042 let cols = if allow_threads {
2043 POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
2044 } else {
2045 self._apply_columns(&|s| s.take_unchecked(idx))
2046 };
2047 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2048 }
2049
2050 /// # Safety
2051 /// The indices must be in-bounds.
2052 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2053 self.take_slice_unchecked_impl(idx, true)
2054 }
2055
2056 /// # Safety
2057 /// The indices must be in-bounds.
2058 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2059 let cols = if allow_threads {
2060 POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2061 } else {
2062 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2063 };
2064 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2065 }
2066
2067 /// Rename a column in the [`DataFrame`].
2068 ///
2069 /// # Example
2070 ///
2071 /// ```
2072 /// # use polars_core::prelude::*;
2073 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2074 /// let original_name = "foo";
2075 /// let new_name = "bar";
2076 /// df.rename(original_name, new_name.into())
2077 /// }
2078 /// ```
2079 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2080 if column == name.as_str() {
2081 return Ok(self);
2082 }
2083 polars_ensure!(
2084 !self.schema().contains(&name),
2085 Duplicate: "column rename attempted with already existing name \"{name}\""
2086 );
2087
2088 self.get_column_index(column)
2089 .and_then(|idx| self.columns.get_mut(idx))
2090 .ok_or_else(|| polars_err!(col_not_found = column))
2091 .map(|c| c.rename(name))?;
2092 self.clear_schema();
2093
2094 Ok(self)
2095 }
2096
2097 /// Sort [`DataFrame`] in place.
2098 ///
    /// See [`DataFrame::sort`] for more information.
2100 pub fn sort_in_place(
2101 &mut self,
2102 by: impl IntoVec<PlSmallStr>,
2103 sort_options: SortMultipleOptions,
2104 ) -> PolarsResult<&mut Self> {
2105 let by_column = self.select_columns(by)?;
2106 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2107 Ok(self)
2108 }
2109
2110 #[doc(hidden)]
    /// This is the dispatch of Self::sort, and exists to reduce compile-time bloat from monomorphization.
2112 pub fn sort_impl(
2113 &self,
2114 by_column: Vec<Column>,
2115 mut sort_options: SortMultipleOptions,
2116 slice: Option<(i64, usize)>,
2117 ) -> PolarsResult<Self> {
2118 if by_column.is_empty() {
2119 // If no columns selected, any order (including original order) is correct.
2120 return if let Some((offset, len)) = slice {
2121 Ok(self.slice(offset, len))
2122 } else {
2123 Ok(self.clone())
2124 };
2125 }
2126
        // Note that the `by_column` argument can also contain expressions evaluated by
        // polars-lazy that may not even be present in this DataFrame. Therefore, when we
        // try to mark the first column as sorted, we ignore any error, as such expressions
        // are not present (they are renamed to _POLARS_SORT_COLUMN_i).
2131 let first_descending = sort_options.descending[0];
2132 let first_by_column = by_column[0].name().to_string();
2133
2134 let set_sorted = |df: &mut DataFrame| {
            // Mark the first sort column as sorted; if the column does not exist that is
            // fine, because we sorted by an expression that is not present in the DataFrame.
2137 let _ = df.apply(&first_by_column, |s| {
2138 let mut s = s.clone();
2139 if first_descending {
2140 s.set_sorted_flag(IsSorted::Descending)
2141 } else {
2142 s.set_sorted_flag(IsSorted::Ascending)
2143 }
2144 s
2145 });
2146 };
2147 if self.is_empty() {
2148 let mut out = self.clone();
2149 set_sorted(&mut out);
2150 return Ok(out);
2151 }
2152
2153 if let Some((0, k)) = slice {
2154 if k < self.len() {
2155 return self.bottom_k_impl(k, by_column, sort_options);
2156 }
2157 }
        // Check whether the column to sort by is already sorted; if so we can exit early.
        // We only do this when there is a single sort column; for multiple columns it would
        // be too complicated.
2161 #[cfg(feature = "dtype-categorical")]
2162 let is_not_categorical_enum =
2163 !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2164 || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2165
2166 #[cfg(not(feature = "dtype-categorical"))]
2167 #[allow(non_upper_case_globals)]
2168 const is_not_categorical_enum: bool = true;
2169
2170 if by_column.len() == 1 && is_not_categorical_enum {
2171 let required_sorting = if sort_options.descending[0] {
2172 IsSorted::Descending
2173 } else {
2174 IsSorted::Ascending
2175 };
            // If the null count is 0 then `nulls_last` doesn't matter.
            // It is safe to get the value at the last position since the DataFrame is not empty (handled above).
2178 let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2179 && ((by_column[0].null_count() == 0)
2180 || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2181 == sort_options.nulls_last[0]);
2182
2183 if no_sorting_required {
2184 return if let Some((offset, len)) = slice {
2185 Ok(self.slice(offset, len))
2186 } else {
2187 Ok(self.clone())
2188 };
2189 }
2190 }
2191
2192 let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2193
2194 // a lot of indirection in both sorting and take
2195 let mut df = self.clone();
2196 let df = df.as_single_chunk_par();
2197 let mut take = match (by_column.len(), has_nested) {
2198 (1, false) => {
2199 let s = &by_column[0];
2200 let options = SortOptions {
2201 descending: sort_options.descending[0],
2202 nulls_last: sort_options.nulls_last[0],
2203 multithreaded: sort_options.multithreaded,
2204 maintain_order: sort_options.maintain_order,
2205 limit: sort_options.limit,
2206 };
                // Fast path for a frame with a single column: no need to compute the sort
                // indices and then gather by them; simply sort the column and return it as a frame.
2210 if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2211 let mut out = s.sort_with(options)?;
2212 if let Some((offset, len)) = slice {
2213 out = out.slice(offset, len);
2214 }
2215 return Ok(out.into_frame());
2216 }
2217 s.arg_sort(options)
2218 },
2219 _ => {
2220 if sort_options.nulls_last.iter().all(|&x| x)
2221 || has_nested
2222 || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2223 {
2224 argsort_multiple_row_fmt(
2225 &by_column,
2226 sort_options.descending,
2227 sort_options.nulls_last,
2228 sort_options.multithreaded,
2229 )?
2230 } else {
2231 let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2232 first
2233 .as_materialized_series()
2234 .arg_sort_multiple(&other, &sort_options)?
2235 }
2236 },
2237 };
2238
2239 if let Some((offset, len)) = slice {
2240 take = take.slice(offset, len);
2241 }
2242
2243 // SAFETY:
2244 // the created indices are in bounds
2245 let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2246 set_sorted(&mut df);
2247 Ok(df)
2248 }
2249
2250 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2251 ///
2252 /// This dataframe does not necessarily have a specified schema and may be changed at any
2253 /// point. It is primarily used for debugging.
2254 pub fn _to_metadata(&self) -> DataFrame {
2255 let num_columns = self.columns.len();
2256
2257 let mut column_names =
2258 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2259 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2260 let mut sorted_asc_ca =
2261 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2262 let mut sorted_dsc_ca =
2263 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2264 let mut fast_explode_list_ca =
2265 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2266 let mut materialized_at_ca =
2267 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2268
2269 for col in &self.columns {
2270 let flags = col.get_flags();
2271
2272 let (repr, materialized_at) = match col {
2273 Column::Series(s) => ("series", s.materialized_at()),
2274 Column::Partitioned(_) => ("partitioned", None),
2275 Column::Scalar(_) => ("scalar", None),
2276 };
2277 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2278 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2279 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2280
2281 column_names.append_value(col.name().clone());
2282 repr_ca.append_value(repr);
2283 sorted_asc_ca.append_value(sorted_asc);
2284 sorted_dsc_ca.append_value(sorted_dsc);
2285 fast_explode_list_ca.append_value(fast_explode_list);
2286 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2287 }
2288
2289 unsafe {
2290 DataFrame::new_no_checks(
2291 self.width(),
2292 vec![
2293 column_names.finish().into_column(),
2294 repr_ca.finish().into_column(),
2295 sorted_asc_ca.finish().into_column(),
2296 sorted_dsc_ca.finish().into_column(),
2297 fast_explode_list_ca.finish().into_column(),
2298 materialized_at_ca.finish().into_column(),
2299 ],
2300 )
2301 }
2302 }
2303
2304 /// Return a sorted clone of this [`DataFrame`].
2305 ///
    /// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2307 /// # Example
2308 ///
2309 /// Sort by a single column with default options:
2310 /// ```
2311 /// # use polars_core::prelude::*;
2312 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2313 /// df.sort(["sepal_width"], Default::default())
2314 /// }
2315 /// ```
2316 /// Sort by a single column with specific order:
2317 /// ```
2318 /// # use polars_core::prelude::*;
2319 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2320 /// df.sort(
2321 /// ["sepal_width"],
2322 /// SortMultipleOptions::new()
2323 /// .with_order_descending(descending)
2324 /// )
2325 /// }
2326 /// ```
    /// Sort by multiple columns, specifying the order for each column:
2328 /// ```
2329 /// # use polars_core::prelude::*;
2330 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2331 /// df.sort(
2332 /// ["sepal_width", "sepal_length"],
2333 /// SortMultipleOptions::new()
2334 /// .with_order_descending_multi([false, true])
2335 /// )
2336 /// }
2337 /// ```
2338 /// See [`SortMultipleOptions`] for more options.
2339 ///
2340 /// Also see [`DataFrame::sort_in_place`].
2341 pub fn sort(
2342 &self,
2343 by: impl IntoVec<PlSmallStr>,
2344 sort_options: SortMultipleOptions,
2345 ) -> PolarsResult<Self> {
2346 let mut df = self.clone();
2347 df.sort_in_place(by, sort_options)?;
2348 Ok(df)
2349 }
2350
2351 /// Replace a column with a [`Series`].
2352 ///
2353 /// # Example
2354 ///
2355 /// ```rust
2356 /// # use polars_core::prelude::*;
2357 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2358 /// "Area (km²)" => [9_833_520, 9_596_961])?;
2359 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2360 ///
2361 /// assert!(df.replace("Nation", s.clone()).is_err());
2362 /// assert!(df.replace("Country", s).is_ok());
2363 /// # Ok::<(), PolarsError>(())
2364 /// ```
2365 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2366 self.apply(column, |_| new_col.into_series())
2367 }
2368
    /// Replace or update a column. The difference between this method and [DataFrame::with_column]
    /// is that the `column` argument determines the name of the column in the DataFrame, not the
    /// name of the `Series` passed to this method.
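    ///
    /// # Example
    ///
    /// A minimal sketch; the column name `"foo"` is illustrative:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &mut DataFrame, s: Series) -> PolarsResult<&mut DataFrame> {
    ///     // The column is stored under the name "foo", regardless of the name of `s`.
    ///     df.replace_or_add("foo".into(), s)
    /// }
    /// ```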
2372 pub fn replace_or_add<S: IntoSeries>(
2373 &mut self,
2374 column: PlSmallStr,
2375 new_col: S,
2376 ) -> PolarsResult<&mut Self> {
2377 let mut new_col = new_col.into_series();
2378 new_col.rename(column);
2379 self.with_column(new_col)
2380 }
2381
2382 /// Replace column at index `idx` with a [`Series`].
2383 ///
2384 /// # Example
2385 ///
    /// ```ignore
2387 /// # use polars_core::prelude::*;
2388 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2389 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2390 /// let mut df = DataFrame::new(vec![s0, s1])?;
2391 ///
2392 /// // Add 32 to get lowercase ascii values
2393 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2394 /// # Ok::<(), PolarsError>(())
2395 /// ```
2396 pub fn replace_column<C: IntoColumn>(
2397 &mut self,
2398 index: usize,
2399 new_column: C,
2400 ) -> PolarsResult<&mut Self> {
2401 polars_ensure!(
2402 index < self.width(),
2403 ShapeMismatch:
2404 "unable to replace at index {}, the DataFrame has only {} columns",
2405 index, self.width(),
2406 );
2407 let mut new_column = new_column.into_column();
2408 polars_ensure!(
2409 new_column.len() == self.height(),
2410 ShapeMismatch:
2411 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2412 new_column.len(), self.height(),
2413 );
2414 let old_col = &mut self.columns[index];
2415 mem::swap(old_col, &mut new_column);
2416 self.clear_schema();
2417 Ok(self)
2418 }
2419
    /// Apply a closure to a column. This is the recommended way to do in-place modification.
2421 ///
2422 /// # Example
2423 ///
2424 /// ```rust
2425 /// # use polars_core::prelude::*;
2426 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2427 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2428 /// let mut df = DataFrame::new(vec![s0, s1])?;
2429 ///
2430 /// fn str_to_len(str_val: &Column) -> Column {
2431 /// str_val.str()
2432 /// .unwrap()
2433 /// .into_iter()
2434 /// .map(|opt_name: Option<&str>| {
2435 /// opt_name.map(|name: &str| name.len() as u32)
2436 /// })
2437 /// .collect::<UInt32Chunked>()
2438 /// .into_column()
2439 /// }
2440 ///
2441 /// // Replace the names column by the length of the names.
2442 /// df.apply("names", str_to_len);
2443 /// # Ok::<(), PolarsError>(())
2444 /// ```
2445 /// Results in:
2446 ///
2447 /// ```text
2448 /// +--------+-------+
    /// | foo    | names |
    /// | ---    | ---   |
2451 /// | str | u32 |
2452 /// +========+=======+
2453 /// | "ham" | 4 |
2454 /// +--------+-------+
2455 /// | "spam" | 6 |
2456 /// +--------+-------+
2457 /// | "egg" | 3 |
2458 /// +--------+-------+
2459 /// ```
2460 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2461 where
2462 F: FnOnce(&Column) -> C,
2463 C: IntoColumn,
2464 {
2465 let idx = self.check_name_to_idx(name)?;
2466 self.apply_at_idx(idx, f)?;
2467 Ok(self)
2468 }
2469
    /// Apply a closure to a column at index `idx`. This is the recommended way to do in-place
    /// modification.
2472 ///
2473 /// # Example
2474 ///
2475 /// ```rust
2476 /// # use polars_core::prelude::*;
2477 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2478 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2479 /// let mut df = DataFrame::new(vec![s0, s1])?;
2480 ///
2481 /// // Add 32 to get lowercase ascii values
2482 /// df.apply_at_idx(1, |s| s + 32);
2483 /// # Ok::<(), PolarsError>(())
2484 /// ```
2485 /// Results in:
2486 ///
2487 /// ```text
2488 /// +--------+-------+
2489 /// | foo | ascii |
2490 /// | --- | --- |
2491 /// | str | i32 |
2492 /// +========+=======+
2493 /// | "ham" | 102 |
2494 /// +--------+-------+
2495 /// | "spam" | 111 |
2496 /// +--------+-------+
2497 /// | "egg" | 111 |
2498 /// +--------+-------+
2499 /// ```
2500 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2501 where
2502 F: FnOnce(&Column) -> C,
2503 C: IntoColumn,
2504 {
2505 let df_height = self.height();
2506 let width = self.width();
2507 let col = self.columns.get_mut(idx).ok_or_else(|| {
2508 polars_err!(
2509 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2510 idx, width
2511 )
2512 })?;
2513 let name = col.name().clone();
2514 let dtype_before = col.dtype().clone();
2515 let new_col = f(col).into_column();
2516 match new_col.len() {
2517 1 => {
2518 let new_col = new_col.new_from_index(0, df_height);
2519 let _ = mem::replace(col, new_col);
2520 },
2521 len if (len == df_height) => {
2522 let _ = mem::replace(col, new_col);
2523 },
2524 len => polars_bail!(
2525 ShapeMismatch:
2526 "resulting Series has length {} while the DataFrame has height {}",
2527 len, df_height
2528 ),
2529 }
2530
2531 // make sure the name remains the same after applying the closure
2532 unsafe {
2533 let col = self.columns.get_unchecked_mut(idx);
2534 col.rename(name);
2535
2536 if col.dtype() != &dtype_before {
2537 self.clear_schema();
2538 }
2539 }
2540 Ok(self)
2541 }
2542
    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do
    /// in-place modification.
2545 ///
2546 /// # Example
2547 ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2549 ///
2550 /// ```rust
2551 /// # use polars_core::prelude::*;
2552 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2553 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2554 /// let mut df = DataFrame::new(vec![s0, s1])?;
2555 ///
2556 /// let idx = vec![0, 1, 4];
2557 ///
2558 /// df.try_apply("foo", |c| {
2559 /// c.str()?
2560 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2561 /// });
2562 /// # Ok::<(), PolarsError>(())
2563 /// ```
2564 /// Results in:
2565 ///
2566 /// ```text
2567 /// +---------------------+--------+
2568 /// | foo | values |
2569 /// | --- | --- |
2570 /// | str | i32 |
2571 /// +=====================+========+
2572 /// | "ham-is-modified" | 1 |
2573 /// +---------------------+--------+
2574 /// | "spam-is-modified" | 2 |
2575 /// +---------------------+--------+
2576 /// | "egg" | 3 |
2577 /// +---------------------+--------+
2578 /// | "bacon" | 4 |
2579 /// +---------------------+--------+
2580 /// | "quack-is-modified" | 5 |
2581 /// +---------------------+--------+
2582 /// ```
2583 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2584 where
2585 F: FnOnce(&Column) -> PolarsResult<C>,
2586 C: IntoColumn,
2587 {
2588 let width = self.width();
2589 let col = self.columns.get_mut(idx).ok_or_else(|| {
2590 polars_err!(
2591 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2592 idx, width
2593 )
2594 })?;
2595 let name = col.name().clone();
2596
2597 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2598
2599 // make sure the name remains the same after applying the closure
2600 unsafe {
2601 let col = self.columns.get_unchecked_mut(idx);
2602 col.rename(name);
2603 }
2604 Ok(self)
2605 }
2606
    /// Apply a closure that may fail to a column. This is the recommended way to do in-place
    /// modification.
2609 ///
2610 /// # Example
2611 ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2613 ///
2614 /// ```rust
2615 /// # use polars_core::prelude::*;
2616 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2617 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2618 /// let mut df = DataFrame::new(vec![s0, s1])?;
2619 ///
2620 /// // create a mask
2621 /// let values = df.column("values")?.as_materialized_series();
2622 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2623 ///
2624 /// df.try_apply("foo", |c| {
2625 /// c.str()?
2626 /// .set(&mask, Some("not_within_bounds"))
2627 /// });
2628 /// # Ok::<(), PolarsError>(())
2629 /// ```
2630 /// Results in:
2631 ///
2632 /// ```text
2633 /// +---------------------+--------+
2634 /// | foo | values |
2635 /// | --- | --- |
2636 /// | str | i32 |
2637 /// +=====================+========+
2638 /// | "not_within_bounds" | 1 |
2639 /// +---------------------+--------+
2640 /// | "spam" | 2 |
2641 /// +---------------------+--------+
2642 /// | "egg" | 3 |
2643 /// +---------------------+--------+
2644 /// | "bacon" | 4 |
2645 /// +---------------------+--------+
2646 /// | "not_within_bounds" | 5 |
2647 /// +---------------------+--------+
2648 /// ```
2649 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2650 where
2651 F: FnOnce(&Series) -> PolarsResult<C>,
2652 C: IntoColumn,
2653 {
2654 let idx = self.try_get_column_index(column)?;
2655 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2656 }
2657
2658 /// Slice the [`DataFrame`] along the rows.
2659 ///
2660 /// # Example
2661 ///
2662 /// ```rust
2663 /// # use polars_core::prelude::*;
2664 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2665 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2666 /// let sl: DataFrame = df.slice(2, 3);
2667 ///
2668 /// assert_eq!(sl.shape(), (3, 2));
2669 /// println!("{}", sl);
2670 /// # Ok::<(), PolarsError>(())
2671 /// ```
2672 /// Output:
2673 /// ```text
2674 /// shape: (3, 2)
2675 /// +-------+-------+
2676 /// | Fruit | Color |
2677 /// | --- | --- |
2678 /// | str | str |
2679 /// +=======+=======+
2680 /// | Grape | White |
2681 /// +-------+-------+
2682 /// | Fig | White |
2683 /// +-------+-------+
2684 /// | Fig | Red |
2685 /// +-------+-------+
2686 /// ```
2687 #[must_use]
2688 pub fn slice(&self, offset: i64, length: usize) -> Self {
2689 if offset == 0 && length == self.height() {
2690 return self.clone();
2691 }
2692 if length == 0 {
2693 return self.clear();
2694 }
2695 let col = self
2696 .columns
2697 .iter()
2698 .map(|s| s.slice(offset, length))
2699 .collect::<Vec<_>>();
2700
2701 let height = if let Some(fst) = col.first() {
2702 fst.len()
2703 } else {
2704 let (_, length) = slice_offsets(offset, length, self.height());
2705 length
2706 };
2707
2708 unsafe { DataFrame::new_no_checks(height, col) }
2709 }
2710
2711 /// Split [`DataFrame`] at the given `offset`.
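    ///
    /// # Example
    ///
    /// A minimal sketch splitting off the first two rows:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> (DataFrame, DataFrame) {
    ///     df.split_at(2)
    /// }
    /// ```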
2712 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2713 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2714
2715 let (idx, _) = slice_offsets(offset, 0, self.height());
2716
2717 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2718 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2719 (a, b)
2720 }
2721
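    /// Create an empty copy of this [`DataFrame`]: the same column names and dtypes, but zero rows.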
2722 pub fn clear(&self) -> Self {
2723 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2724 unsafe { DataFrame::new_no_checks(0, col) }
2725 }
2726
2727 #[must_use]
2728 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2729 if offset == 0 && length == self.height() {
2730 return self.clone();
2731 }
2732 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2733 unsafe { DataFrame::new_no_checks(length, columns) }
2734 }
2735
2736 #[must_use]
2737 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2738 if offset == 0 && length == self.height() {
2739 return self.clone();
2740 }
2741 // @scalar-opt
2742 let columns = self._apply_columns(&|s| {
2743 let mut out = s.slice(offset, length);
2744 out.shrink_to_fit();
2745 out
2746 });
2747 unsafe { DataFrame::new_no_checks(length, columns) }
2748 }
2749
2750 /// Get the head of the [`DataFrame`].
2751 ///
2752 /// # Example
2753 ///
2754 /// ```rust
2755 /// # use polars_core::prelude::*;
2756 /// let countries: DataFrame =
2757 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2758 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2759 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2760 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2761 /// assert_eq!(countries.shape(), (5, 4));
2762 ///
2763 /// println!("{}", countries.head(Some(3)));
2764 /// # Ok::<(), PolarsError>(())
2765 /// ```
2766 ///
2767 /// Output:
2768 ///
2769 /// ```text
2770 /// shape: (3, 4)
2771 /// +--------------------+---------------+---------------+------------+
2772 /// | Rank by GDP (2021) | Continent | Country | Capital |
2773 /// | --- | --- | --- | --- |
2774 /// | i32 | str | str | str |
2775 /// +====================+===============+===============+============+
2776 /// | 1 | North America | United States | Washington |
2777 /// +--------------------+---------------+---------------+------------+
2778 /// | 2 | Asia | China | Beijing |
2779 /// +--------------------+---------------+---------------+------------+
2780 /// | 3 | Asia | Japan | Tokyo |
2781 /// +--------------------+---------------+---------------+------------+
2782 /// ```
2783 #[must_use]
2784 pub fn head(&self, length: Option<usize>) -> Self {
2785 let col = self
2786 .columns
2787 .iter()
2788 .map(|c| c.head(length))
2789 .collect::<Vec<_>>();
2790
2791 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2792 let height = usize::min(height, self.height());
2793 unsafe { DataFrame::new_no_checks(height, col) }
2794 }
2795
2796 /// Get the tail of the [`DataFrame`].
2797 ///
2798 /// # Example
2799 ///
2800 /// ```rust
2801 /// # use polars_core::prelude::*;
2802 /// let countries: DataFrame =
2803 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2805 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2806 /// assert_eq!(countries.shape(), (5, 3));
2807 ///
2808 /// println!("{}", countries.tail(Some(2)));
2809 /// # Ok::<(), PolarsError>(())
2810 /// ```
2811 ///
2812 /// Output:
2813 ///
2814 /// ```text
2815 /// shape: (2, 3)
2816 /// +-------------+--------------------+---------+
    /// | Rank (2021) | Apple Price (€/kg) | Country |
2818 /// | --- | --- | --- |
2819 /// | i32 | f64 | str |
2820 /// +=============+====================+=========+
    /// | 108         | 0.65               | Syria   |
    /// +-------------+--------------------+---------+
    /// | 109         | 0.52               | Turkey  |
2824 /// +-------------+--------------------+---------+
2825 /// ```
2826 #[must_use]
2827 pub fn tail(&self, length: Option<usize>) -> Self {
2828 let col = self
2829 .columns
2830 .iter()
2831 .map(|c| c.tail(length))
2832 .collect::<Vec<_>>();
2833
2834 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2835 let height = usize::min(height, self.height());
2836 unsafe { DataFrame::new_no_checks(height, col) }
2837 }
2838
    /// Iterator over the chunks of this [`DataFrame`], yielding each chunk as an Arrow [`RecordBatch`].
2840 ///
2841 /// # Panics
2842 ///
    /// Panics, in debug builds, if this [`DataFrame`] is not rechunked.
2844 ///
2845 /// This responsibility is left to the caller as we don't want to take mutable references here,
2846 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2847 /// as well.
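    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `df` is already rechunked:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) {
    ///     for batch in df.iter_chunks(CompatLevel::newest(), false) {
    ///         println!("batch with {} rows", batch.len());
    ///     }
    /// }
    /// ```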
2848 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2849 debug_assert!(!self.should_rechunk(), "expected equal chunks");
        // We only allow parallelism when we must convert for the given `compat_level` and any of
        // the columns is a string/binary, as we then have to allocate new Arrow string/binary arrays.
2852 let must_convert = compat_level.0 == 0;
2853 let parallel = parallel
2854 && must_convert
2855 && self.columns.len() > 1
2856 && self
2857 .columns
2858 .iter()
2859 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2860
2861 RecordBatchIter {
2862 columns: &self.columns,
2863 schema: Arc::new(
2864 self.columns
2865 .iter()
2866 .map(|c| c.field().to_arrow(compat_level))
2867 .collect(),
2868 ),
2869 idx: 0,
2870 n_chunks: self.first_col_n_chunks(),
2871 compat_level,
2872 parallel,
2873 }
2874 }
2875
    /// Iterator over the chunks of this [`DataFrame`], yielding each chunk as an Arrow [`RecordBatch`] of physical values.
2877 ///
2878 /// # Panics
2879 ///
    /// Panics, in debug builds, if this [`DataFrame`] is not rechunked.
2881 ///
2882 /// This responsibility is left to the caller as we don't want to take mutable references here,
2883 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2884 /// as well.
2885 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2886 debug_assert!(!self.should_rechunk());
2887 PhysRecordBatchIter {
2888 schema: Arc::new(
2889 self.get_columns()
2890 .iter()
2891 .map(|c| c.field().to_arrow(CompatLevel::newest()))
2892 .collect(),
2893 ),
2894 arr_iters: self
2895 .materialized_column_iter()
2896 .map(|s| s.chunks().iter())
2897 .collect(),
2898 }
2899 }
2900
2901 /// Get a [`DataFrame`] with all the columns in reversed order.
2902 #[must_use]
2903 pub fn reverse(&self) -> Self {
2904 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2905 unsafe { DataFrame::new_no_checks(self.height(), col) }
2906 }
2907
    /// Shift the values by a given period and fill the parts that become empty due to this
    /// operation with `None` values.
2910 ///
2911 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
2912 #[must_use]
2913 pub fn shift(&self, periods: i64) -> Self {
2914 let col = self._apply_columns_par(&|s| s.shift(periods));
2915 unsafe { DataFrame::new_no_checks(self.height(), col) }
2916 }
2917
2918 /// Replace None values with one of the following strategies:
2919 /// * Forward fill (replace None with the previous value)
2920 /// * Backward fill (replace None with the next value)
2921 /// * Mean fill (replace None with the mean of the whole array)
2922 /// * Min fill (replace None with the minimum of the whole array)
2923 /// * Max fill (replace None with the maximum of the whole array)
2924 ///
2925 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
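    ///
    /// # Example
    ///
    /// A minimal sketch using the mean fill strategy:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.fill_null(FillNullStrategy::Mean)
    /// }
    /// ```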
2926 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2927 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2928
2929 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2930 }
2931
    /// Pipe different functions/closures that operate on a DataFrame together.
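    ///
    /// # Example
    ///
    /// A minimal sketch piping an owned [`DataFrame`] through a free function:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn first_row(df: DataFrame) -> PolarsResult<DataFrame> {
    ///     Ok(df.head(Some(1)))
    /// }
    ///
    /// fn example(df: DataFrame) -> PolarsResult<DataFrame> {
    ///     df.pipe(first_row)
    /// }
    /// ```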
2933 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2934 where
2935 F: Fn(DataFrame) -> PolarsResult<B>,
2936 {
2937 f(self)
2938 }
2939
    /// Pipe different functions/closures that operate on a DataFrame together.
2941 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2942 where
2943 F: Fn(&mut DataFrame) -> PolarsResult<B>,
2944 {
2945 f(self)
2946 }
2947
    /// Pipe different functions/closures that operate on a DataFrame together.
2949 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2950 where
2951 F: Fn(DataFrame, Args) -> PolarsResult<B>,
2952 {
2953 f(self, args)
2954 }
2955
2956 /// Drop duplicate rows from a [`DataFrame`].
    /// *This fails when the DataFrame contains a column of type List.*
2958 ///
2959 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2960 ///
2961 /// # Example
2962 ///
2963 /// ```no_run
2964 /// # use polars_core::prelude::*;
2965 /// let df = df! {
2966 /// "flt" => [1., 1., 2., 2., 3., 3.],
2967 /// "int" => [1, 1, 2, 2, 3, 3, ],
2968 /// "str" => ["a", "a", "b", "b", "c", "c"]
2969 /// }?;
2970 ///
2971 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2972 /// # Ok::<(), PolarsError>(())
2973 /// ```
2974 /// Returns
2975 ///
2976 /// ```text
2977 /// +-----+-----+-----+
2978 /// | flt | int | str |
2979 /// | --- | --- | --- |
2980 /// | f64 | i32 | str |
2981 /// +=====+=====+=====+
2982 /// | 1 | 1 | "a" |
2983 /// +-----+-----+-----+
2984 /// | 2 | 2 | "b" |
2985 /// +-----+-----+-----+
2986 /// | 3 | 3 | "c" |
2987 /// +-----+-----+-----+
2988 /// ```
2989 #[cfg(feature = "algorithm_group_by")]
2990 pub fn unique_stable(
2991 &self,
2992 subset: Option<&[String]>,
2993 keep: UniqueKeepStrategy,
2994 slice: Option<(i64, usize)>,
2995 ) -> PolarsResult<DataFrame> {
2996 self.unique_impl(
2997 true,
2998 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2999 keep,
3000 slice,
3001 )
3002 }
3003
3004 /// Unstable distinct. See [`DataFrame::unique_stable`].
3005 #[cfg(feature = "algorithm_group_by")]
3006 pub fn unique<I, S>(
3007 &self,
3008 subset: Option<&[String]>,
3009 keep: UniqueKeepStrategy,
3010 slice: Option<(i64, usize)>,
3011 ) -> PolarsResult<DataFrame> {
3012 self.unique_impl(
3013 false,
3014 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3015 keep,
3016 slice,
3017 )
3018 }
3019
3020 #[cfg(feature = "algorithm_group_by")]
3021 pub fn unique_impl(
3022 &self,
3023 maintain_order: bool,
3024 subset: Option<Vec<PlSmallStr>>,
3025 keep: UniqueKeepStrategy,
3026 slice: Option<(i64, usize)>,
3027 ) -> PolarsResult<Self> {
3028 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3029 let mut df = self.clone();
        // `take` on multiple chunks performs poorly, so rechunk first.
3031 df.as_single_chunk_par();
3032
3033 let columns = match (keep, maintain_order) {
3034 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3035 let gb = df.group_by_stable(names)?;
3036 let groups = gb.get_groups();
3037 let (offset, len) = slice.unwrap_or((0, groups.len()));
3038 let groups = groups.slice(offset, len);
3039 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3040 },
3041 (UniqueKeepStrategy::Last, true) => {
                // We maintain order by the last values, so the stable groups cannot be used
                // directly, as they are ordered by the first value.
3044 let gb = df.group_by_stable(names)?;
3045 let groups = gb.get_groups();
3046
3047 let last_idx: NoNull<IdxCa> = groups
3048 .iter()
3049 .map(|g| match g {
3050 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3051 GroupsIndicator::Slice([first, len]) => first + len - 1,
3052 })
3053 .collect();
3054
3055 let mut last_idx = last_idx.into_inner().sort(false);
3056
3057 if let Some((offset, len)) = slice {
3058 last_idx = last_idx.slice(offset, len);
3059 }
3060
3061 let last_idx = NoNull::new(last_idx);
3062 let out = unsafe { df.take_unchecked(&last_idx) };
3063 return Ok(out);
3064 },
3065 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3066 let gb = df.group_by(names)?;
3067 let groups = gb.get_groups();
3068 let (offset, len) = slice.unwrap_or((0, groups.len()));
3069 let groups = groups.slice(offset, len);
3070 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3071 },
3072 (UniqueKeepStrategy::Last, false) => {
3073 let gb = df.group_by(names)?;
3074 let groups = gb.get_groups();
3075 let (offset, len) = slice.unwrap_or((0, groups.len()));
3076 let groups = groups.slice(offset, len);
3077 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3078 },
3079 (UniqueKeepStrategy::None, _) => {
3080 let df_part = df.select(names)?;
3081 let mask = df_part.is_unique()?;
3082 let mut filtered = df.filter(&mask)?;
3083
3084 if let Some((offset, len)) = slice {
3085 filtered = filtered.slice(offset, len);
3086 }
3087 return Ok(filtered);
3088 },
3089 };
3090 let height = Self::infer_height(&columns);
3091 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3092 }
3093
3094 /// Get a mask of all the unique rows in the [`DataFrame`].
3095 ///
3096 /// # Example
3097 ///
3098 /// ```no_run
3099 /// # use polars_core::prelude::*;
3100 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3101 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3102 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3103 ///
3104 /// assert!(ca.all());
3105 /// # Ok::<(), PolarsError>(())
3106 /// ```
3107 #[cfg(feature = "algorithm_group_by")]
3108 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3109 let gb = self.group_by(self.get_column_names_owned())?;
3110 let groups = gb.get_groups();
3111 Ok(is_unique_helper(
3112 groups,
3113 self.height() as IdxSize,
3114 true,
3115 false,
3116 ))
3117 }
3118
3119 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3120 ///
3121 /// # Example
3122 ///
3123 /// ```no_run
3124 /// # use polars_core::prelude::*;
3125 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3126 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3127 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3128 ///
3129 /// assert!(!ca.all());
3130 /// # Ok::<(), PolarsError>(())
3131 /// ```
3132 #[cfg(feature = "algorithm_group_by")]
3133 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3134 let gb = self.group_by(self.get_column_names_owned())?;
3135 let groups = gb.get_groups();
3136 Ok(is_unique_helper(
3137 groups,
3138 self.height() as IdxSize,
3139 false,
3140 true,
3141 ))
3142 }
3143
3144 /// Create a new [`DataFrame`] that shows the null counts per column.
3145 #[must_use]
3146 pub fn null_count(&self) -> Self {
3147 let cols = self
3148 .columns
3149 .iter()
3150 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3151 .collect();
3152 unsafe { Self::new_no_checks(1, cols) }
3153 }
3154
    /// Hash and combine the row values.
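    ///
    /// # Example
    ///
    /// A minimal sketch using the default hasher:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &mut DataFrame) -> PolarsResult<UInt64Chunked> {
    ///     df.hash_rows(None)
    /// }
    /// ```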
3156 #[cfg(feature = "row_hash")]
3157 pub fn hash_rows(
3158 &mut self,
3159 hasher_builder: Option<PlSeedableRandomStateQuality>,
3160 ) -> PolarsResult<UInt64Chunked> {
3161 let dfs = split_df(self, POOL.current_num_threads(), false);
3162 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3163
3164 let mut iter = cas.into_iter();
3165 let mut acc_ca = iter.next().unwrap();
3166 for ca in iter {
3167 acc_ca.append(&ca)?;
3168 }
3169 Ok(acc_ca.rechunk().into_owned())
3170 }
3171
3172 /// Get the supertype of the columns in this DataFrame
3173 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3174 self.columns
3175 .iter()
3176 .map(|s| Ok(s.dtype().clone()))
3177 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3178 }
3179
3180 /// Take by index values given by the slice `idx`.
3181 /// # Warning
    /// Be careful with allowing threads when calling this in a large hot loop: every thread
    /// split may end up on the rayon stack and lead to a stack overflow (SO).
3184 #[doc(hidden)]
3185 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3186 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3187 }
3188
    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
    /// if the index values in `idx` are sorted. This will maintain the sorted flags.
3191 ///
3192 /// # Warning
    /// Be careful with allowing threads when calling this in a large hot loop: every thread
    /// split may end up on the rayon stack and lead to a stack overflow (SO).
3195 #[doc(hidden)]
3196 pub unsafe fn _take_unchecked_slice_sorted(
3197 &self,
3198 idx: &[IdxSize],
3199 allow_threads: bool,
3200 sorted: IsSorted,
3201 ) -> Self {
3202 #[cfg(debug_assertions)]
3203 {
3204 if idx.len() > 2 {
3205 match sorted {
3206 IsSorted::Ascending => {
3207 assert!(idx[0] <= idx[idx.len() - 1]);
3208 },
3209 IsSorted::Descending => {
3210 assert!(idx[0] >= idx[idx.len() - 1]);
3211 },
3212 _ => {},
3213 }
3214 }
3215 }
3216 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3217 ca.set_sorted_flag(sorted);
3218 self.take_unchecked_impl(&ca, allow_threads)
3219 }
3220
3221 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3222 #[doc(hidden)]
3223 pub fn _partition_by_impl(
3224 &self,
3225 cols: &[PlSmallStr],
3226 stable: bool,
3227 include_key: bool,
3228 parallel: bool,
3229 ) -> PolarsResult<Vec<DataFrame>> {
3230 let selected_keys = self.select_columns(cols.iter().cloned())?;
3231 let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3232 let groups = groups.take_groups();
3233
3234 // drop key columns prior to calculation if requested
3235 let df = if include_key {
3236 self.clone()
3237 } else {
3238 self.drop_many(cols.iter().cloned())
3239 };
3240
3241 if parallel {
            // Don't parallelize the inner take (allow_threads = false): there is already a lot of
            // parallelization in take and nesting it may easily cause a stack overflow.
3244 POOL.install(|| {
3245 match groups.as_ref() {
3246 GroupsType::Idx(idx) => {
3247 // Rechunk as the gather may rechunk for every group #17562.
3248 let mut df = df.clone();
3249 df.as_single_chunk_par();
3250 Ok(idx
3251 .into_par_iter()
3252 .map(|(_, group)| {
3253 // groups are in bounds
3254 unsafe {
3255 df._take_unchecked_slice_sorted(
3256 group,
3257 false,
3258 IsSorted::Ascending,
3259 )
3260 }
3261 })
3262 .collect())
3263 },
3264 GroupsType::Slice { groups, .. } => Ok(groups
3265 .into_par_iter()
3266 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3267 .collect()),
3268 }
3269 })
3270 } else {
3271 match groups.as_ref() {
3272 GroupsType::Idx(idx) => {
3273 // Rechunk as the gather may rechunk for every group #17562.
3274 let mut df = df;
3275 df.as_single_chunk();
3276 Ok(idx
3277 .into_iter()
3278 .map(|(_, group)| {
3279 // groups are in bounds
3280 unsafe {
3281 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3282 }
3283 })
3284 .collect())
3285 },
3286 GroupsType::Slice { groups, .. } => Ok(groups
3287 .iter()
3288 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3289 .collect()),
3290 }
3291 }
3292 }
3293
    /// Split into multiple DataFrames partitioned by groups.
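    ///
    /// # Example
    ///
    /// A minimal sketch; the key column name `"category"` is illustrative:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> PolarsResult<Vec<DataFrame>> {
    ///     df.partition_by(["category"], true)
    /// }
    /// ```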
3295 #[cfg(feature = "partition_by")]
3296 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3297 where
3298 I: IntoIterator<Item = S>,
3299 S: Into<PlSmallStr>,
3300 {
3301 let cols = cols
3302 .into_iter()
3303 .map(Into::into)
3304 .collect::<Vec<PlSmallStr>>();
3305 self._partition_by_impl(cols.as_slice(), false, include_key, true)
3306 }
3307
    /// Split into multiple DataFrames partitioned by groups.
    /// The order of the groups is maintained.
3310 #[cfg(feature = "partition_by")]
3311 pub fn partition_by_stable<I, S>(
3312 &self,
3313 cols: I,
3314 include_key: bool,
3315 ) -> PolarsResult<Vec<DataFrame>>
3316 where
3317 I: IntoIterator<Item = S>,
3318 S: Into<PlSmallStr>,
3319 {
3320 let cols = cols
3321 .into_iter()
3322 .map(Into::into)
3323 .collect::<Vec<PlSmallStr>>();
3324 self._partition_by_impl(cols.as_slice(), true, include_key, true)
3325 }
3326
3327 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3328 /// inserted as columns.
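    ///
    /// # Example
    ///
    /// A minimal sketch; the struct column name `"person"` is illustrative:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.unnest(["person"])
    /// }
    /// ```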
3329 #[cfg(feature = "dtype-struct")]
3330 pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3331 let cols = cols.into_vec();
3332 self.unnest_impl(cols.into_iter().collect())
3333 }
3334
3335 #[cfg(feature = "dtype-struct")]
3336 fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3337 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3338 let mut count = 0;
3339 for s in &self.columns {
3340 if cols.contains(s.name()) {
3341 let ca = s.struct_()?.clone();
3342 new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3343 count += 1;
3344 } else {
3345 new_cols.push(s.clone())
3346 }
3347 }
3348 if count != cols.len() {
3349 // one or more columns not found
3350 // the code below will return an error with the missing name
3351 let schema = self.schema();
3352 for col in cols {
3353 let _ = schema
3354 .get(col.as_str())
3355 .ok_or_else(|| polars_err!(col_not_found = col))?;
3356 }
3357 }
3358 DataFrame::new(new_cols)
3359 }
3360
3361 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3362 cols.first().map_or(0, Column::len)
3363 }
3364
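    /// Append an Arrow [`RecordBatchT`] to this [`DataFrame`] by converting it to a [`DataFrame`]
    /// and vertically stacking it. Errors if the schemas do not match.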
3365 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
        // @Optimize: this does a lot of unnecessary allocations. We should probably have an
        // `append_chunk` or something like it. It is just quite difficult to make that safe.
3368 let df = DataFrame::from(rb);
3369 polars_ensure!(
3370 self.schema() == df.schema(),
            SchemaMismatch: "cannot append record batch with different schema\ngot: {:?}\nexpected: {:?}",
            df.schema(), self.schema(),
3373 );
3374 self.vstack_mut_owned_unchecked(df);
3375 Ok(())
3376 }
3377}
3378
3379pub struct RecordBatchIter<'a> {
3380 columns: &'a Vec<Column>,
3381 schema: ArrowSchemaRef,
3382 idx: usize,
3383 n_chunks: usize,
3384 compat_level: CompatLevel,
3385 parallel: bool,
3386}
3387
3388impl Iterator for RecordBatchIter<'_> {
3389 type Item = RecordBatch;
3390
3391 fn next(&mut self) -> Option<Self::Item> {
3392 if self.idx >= self.n_chunks {
3393 return None;
3394 }
3395
3396 // Create a batch of the columns with the same chunk no.
3397 let batch_cols: Vec<ArrayRef> = if self.parallel {
3398 let iter = self
3399 .columns
3400 .par_iter()
3401 .map(Column::as_materialized_series)
3402 .map(|s| s.to_arrow(self.idx, self.compat_level));
3403 POOL.install(|| iter.collect())
3404 } else {
3405 self.columns
3406 .iter()
3407 .map(Column::as_materialized_series)
3408 .map(|s| s.to_arrow(self.idx, self.compat_level))
3409 .collect()
3410 };
3411 self.idx += 1;
3412
3413 let length = batch_cols.first().map_or(0, |arr| arr.len());
3414 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3415 }
3416
3417 fn size_hint(&self) -> (usize, Option<usize>) {
3418 let n = self.n_chunks - self.idx;
3419 (n, Some(n))
3420 }
3421}
3422
3423pub struct PhysRecordBatchIter<'a> {
3424 schema: ArrowSchemaRef,
3425 arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3426}
3427
3428impl Iterator for PhysRecordBatchIter<'_> {
3429 type Item = RecordBatch;
3430
3431 fn next(&mut self) -> Option<Self::Item> {
3432 let arrs = self
3433 .arr_iters
3434 .iter_mut()
3435 .map(|phys_iter| phys_iter.next().cloned())
3436 .collect::<Option<Vec<_>>>()?;
3437
3438 let length = arrs.first().map_or(0, |arr| arr.len());
3439 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3440 }
3441
3442 fn size_hint(&self) -> (usize, Option<usize>) {
3443 if let Some(iter) = self.arr_iters.first() {
3444 iter.size_hint()
3445 } else {
3446 (0, None)
3447 }
3448 }
3449}
3450
3451impl Default for DataFrame {
3452 fn default() -> Self {
3453 DataFrame::empty()
3454 }
3455}
3456
3457impl From<DataFrame> for Vec<Column> {
3458 fn from(df: DataFrame) -> Self {
3459 df.columns
3460 }
3461}
3462
3463// utility to test if we can vstack/extend the columns
3464fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3465 polars_ensure!(
3466 left.name() == right.name(),
3467 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3468 left.name(), right.name(),
3469 );
3470 Ok(())
3471}
3472
3473#[cfg(test)]
3474mod test {
3475 use super::*;
3476
3477 fn create_frame() -> DataFrame {
3478 let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3479 let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3480 DataFrame::new(vec![s0, s1]).unwrap()
3481 }
3482
3483 #[test]
3484 #[cfg_attr(miri, ignore)]
3485 fn test_recordbatch_iterator() {
3486 let df = df!(
3487 "foo" => [1, 2, 3, 4, 5]
3488 )
3489 .unwrap();
3490 let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3491 assert_eq!(5, iter.next().unwrap().len());
3492 assert!(iter.next().is_none());
3493 }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        assert_eq!(
            df.column("days")
                .unwrap()
                .as_series()
                .unwrap()
                .equal(1)
                .unwrap()
                .sum(),
            Some(1)
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let v = vec!["test".to_string()];
        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
        let mut df = DataFrame::new(vec![s0]).unwrap();

        df = df
            .filter(
                &df.column(col_name)
                    .unwrap()
                    .as_materialized_series()
                    .equal("")
                    .unwrap(),
            )
            .unwrap();
        assert_eq!(
            df.column(col_name)
                .unwrap()
                .as_materialized_series()
                .n_chunks(),
            1
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let ll: ListChunked = [&s1].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let new = ll.filter(&mask).unwrap();

        assert_eq!(new.chunks.len(), 1);
        assert_eq!(new.len(), 0);
    }

    #[test]
    fn slice() {
        let df = create_frame();
        let sliced_df = df.slice(0, 2);
        assert_eq!(sliced_df.shape(), (2, 2));
    }

    #[test]
    fn rechunk_false() {
        let df = create_frame();
        assert!(!df.should_rechunk())
    }

    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Create a series with multiple chunks
        let mut s = Series::new("foo".into(), 0..2);
        let s2 = Series::new("bar".into(), 0..1);
        s.append(&s2)?;

        // Append series to frame
        let out = base.with_column(s)?;

        // Now we should rechunk
        assert!(out.should_rechunk());
        Ok(())
    }

    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();
        // check if column is replaced
        assert!(
            df.with_column(Series::new("foo".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(
            df.with_column(Series::new("bar".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(df.column("bar").is_ok())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();
        let df = df
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap()
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();
        let valid = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(df.equals(&valid));
    }

    #[test]
    fn test_vstack() {
        // check that it does not accidentally rechunk
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df.slice(0, 3)).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    #[test]
    fn test_vstack_on_empty_dataframe() {
        let mut df = DataFrame::empty();

        let df_data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df_data).unwrap();
        assert_eq!(df.height, 6)
    }

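    // Sketch (assumption): `vstack_mut` surfaces the name-mismatch error enforced by
    // `ensure_can_extend`, so stacking frames whose column names differ should fail.
    #[test]
    fn test_vstack_name_mismatch() {
        let mut df_a = df! {
            "a" => [1, 2, 3]
        }
        .unwrap();
        let df_b = df! {
            "b" => [4, 5, 6]
        }
        .unwrap();
        assert!(df_a.vstack_mut(&df_b).is_err());
    }
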
    #[test]
    fn test_replace_or_add() -> PolarsResult<()> {
        let mut df = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // check that the new column is "c" and not "bar".
        df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;

        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
        Ok(())
    }

    #[test]
    fn test_unique_keep_none_with_slice() {
        let df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();
        let out = df
            .unique_stable(
                Some(&["x".to_string()][..]),
                UniqueKeepStrategy::None,
                Some((0, 2)),
            )
            .unwrap();
        let expected = df! {
            "x" => [3]
        }
        .unwrap();
        assert!(out.equals(&expected));
    }

    #[test]
    #[cfg(feature = "dtype-i8")]
    fn test_apply_result_schema() {
        let mut df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();

        let schema_before = df.schema().clone();
        df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
        assert_ne!(&schema_before, df.schema());
3698 }
3699}