polars_core/frame/mod.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
50#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
51#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
52#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
53#[strum(serialize_all = "snake_case")]
54pub enum UniqueKeepStrategy {
55    /// Keep the first unique row.
56    First,
57    /// Keep the last unique row.
58    Last,
59    /// Keep none of the unique rows.
60    None,
61    /// Keep any of the unique rows.
62    /// This allows for more optimizations.
63    #[default]
64    Any,
65}
66
67fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
68where
69    F: for<'a> FnMut(&'a T) -> &'a str,
70{
71    // Always unique.
72    if items.len() <= 1 {
73        return Ok(());
74    }
75
76    if items.len() <= 4 {
77        // Too small to be worth spawning a hashmap for; this is at most 6 comparisons.
78        for i in 0..items.len() - 1 {
79            let name = get_name(&items[i]);
80            for other in items.iter().skip(i + 1) {
81                if name == get_name(other) {
82                    polars_bail!(duplicate = name);
83                }
84            }
85        }
86    } else {
87        let mut names = PlHashSet::with_capacity(items.len());
88        for item in items {
89            let name = get_name(item);
90            if !names.insert(name) {
91                polars_bail!(duplicate = name);
92            }
93        }
94    }
95    Ok(())
96}
97
98/// A contiguous growable collection of `Series` that have the same length.
99///
100/// ## Use declarations
101///
102/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
103///
104/// ```rust
105/// use polars_core::prelude::*; // if the crate polars-core is used directly
106/// // use polars::prelude::*;      if the crate polars is used
107/// ```
108///
109/// # Initialization
110/// ## Default
111///
112/// A `DataFrame` can be initialized empty:
113///
114/// ```rust
115/// # use polars_core::prelude::*;
116/// let df = DataFrame::default();
117/// assert!(df.is_empty());
118/// ```
119///
120/// ## Wrapping a `Vec<Series>`
121///
122/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
123///
124/// ```rust
125/// # use polars_core::prelude::*;
126/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
127/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
128///
129/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
130/// ```
131///
132/// ## Using a macro
133///
134/// The [`df!`] macro is a convenient method:
135///
136/// ```rust
137/// # use polars_core::prelude::*;
138/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
139///                                       "Color" => ["Red", "Yellow", "Green"]);
140/// ```
141///
142/// ## Using a CSV file
143///
144/// See the `polars_io::csv::CsvReader`.
145///
146/// # Indexing
147/// ## By a number
148///
149    /// `Index<usize>` is implemented for `DataFrame`.
150///
151/// ```rust
152/// # use polars_core::prelude::*;
153/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
154///              "Color" => ["Red", "Yellow", "Green"])?;
155///
156/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
157/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
158/// # Ok::<(), PolarsError>(())
159/// ```
160///
161/// ## By a `Series` name
162///
163/// ```rust
164/// # use polars_core::prelude::*;
165/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
166///              "Color" => ["Red", "Yellow", "Green"])?;
167///
168/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
169/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
170/// # Ok::<(), PolarsError>(())
171/// ```
172#[derive(Clone)]
173pub struct DataFrame {
174    height: usize,
175    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
176    pub(crate) columns: Vec<Column>,
177
178    /// A cached schema. This might not give correct results if the DataFrame is modified in place
179    /// between computing the schema and reading it.
180    cached_schema: OnceLock<SchemaRef>,
181}
182
183impl DataFrame {
184    pub fn clear_schema(&mut self) {
185        self.cached_schema = OnceLock::new();
186    }
187
188    #[inline]
189    pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
190        self.columns.iter()
191    }
192
193    #[inline]
194    pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
195        self.columns.iter().map(Column::as_materialized_series)
196    }
197
198    #[inline]
199    pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
200        self.columns.par_iter().map(Column::as_materialized_series)
201    }
202
203    /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
204    ///
205    /// # Implementation
206    /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
207    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not necessarily the
208    /// sum of the sizes computed by this function. In particular, a [`StructArray`]'s size is an upper bound.
209    ///
210    /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
211    /// However, this function will yield a smaller number, because it returns
212    /// the visible size of the buffer, not its total capacity.
213    ///
214    /// FFI buffers are included in this estimation.
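    ///
    /// # Example
    ///
    /// A minimal usage sketch (illustrative values; the exact byte count depends on the column
    /// types and backing buffers, so only a loose check is made here):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("x" => [1i32, 2, 3])?;
    /// // Three i32 values occupy at least 3 * 4 bytes of buffer.
    /// assert!(df.estimated_size() >= 3 * std::mem::size_of::<i32>());
    /// # Ok::<(), PolarsError>(())
    /// ```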
215    pub fn estimated_size(&self) -> usize {
216        self.columns.iter().map(Column::estimated_size).sum()
217    }
218
219    // Reduce monomorphization.
220    fn try_apply_columns(
221        &self,
222        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
223    ) -> PolarsResult<Vec<Column>> {
224        self.columns.iter().map(func).collect()
225    }
226    // Reduce monomorphization.
227    pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
228        self.columns.iter().map(func).collect()
229    }
230    // Reduce monomorphization.
231    fn try_apply_columns_par(
232        &self,
233        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
234    ) -> PolarsResult<Vec<Column>> {
235        POOL.install(|| self.columns.par_iter().map(func).collect())
236    }
237    // Reduce monomorphization.
238    pub fn _apply_columns_par(
239        &self,
240        func: &(dyn Fn(&Column) -> Column + Send + Sync),
241    ) -> Vec<Column> {
242        POOL.install(|| self.columns.par_iter().map(func).collect())
243    }
244
245    /// Get the index of the column.
246    fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
247        self.get_column_index(name)
248            .ok_or_else(|| polars_err!(col_not_found = name))
249    }
250
251    fn check_already_present(&self, name: &str) -> PolarsResult<()> {
252        polars_ensure!(
253            self.columns.iter().all(|s| s.name().as_str() != name),
254            Duplicate: "column with name {:?} is already present in the DataFrame", name
255        );
256        Ok(())
257    }
258
259    /// Reserve additional slots in the chunks of the series.
260    pub(crate) fn reserve_chunks(&mut self, additional: usize) {
261        for s in &mut self.columns {
262            if let Column::Series(s) = s {
263                // SAFETY:
264                // do not modify the data, simply resize.
265                unsafe { s.chunks_mut().reserve(additional) }
266            }
267        }
268    }
269
270    /// Create a DataFrame from a vector of columns.
271    ///
272    /// Errors if the column names are not unique, or if the heights are not all equal.
273    ///
274    /// # Example
275    ///
276    /// ```
277    /// # use polars_core::prelude::*;
278    /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
279    /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
280    ///
281    /// let df = DataFrame::new(vec![s0, s1])?;
282    /// # Ok::<(), PolarsError>(())
283    /// ```
284    pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
285        DataFrame::validate_columns_slice(&columns)
286            .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
287        Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
288    }
289
290    pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
291        for col in &columns {
292            polars_ensure!(
293                col.len() == height,
294                ShapeMismatch: "could not create a new DataFrame: the given height is {} while column {:?} has length {}",
295                height, col.name(), col.len()
296            );
297        }
298
299        Ok(DataFrame {
300            height,
301            columns,
302            cached_schema: OnceLock::new(),
303        })
304    }
305
306    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
307    /// columns to match the other columns.
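    ///
    /// # Example
    ///
    /// A minimal sketch with illustrative column names and values:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let a = Column::new("a".into(), [1, 2, 3]);
    /// let b = Column::new("b".into(), [10]); // length-1, broadcast to length 3
    /// let df = DataFrame::new_with_broadcast(vec![a, b])?;
    /// assert_eq!(df.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```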
308    pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
309        // The length of the longest non-unit length column determines the
310        // broadcast length. If all columns are unit-length the broadcast length
311        // is one.
312        let broadcast_len = columns
313            .iter()
314            .map(|s| s.len())
315            .filter(|l| *l != 1)
316            .max()
317            .unwrap_or(1);
318        Self::new_with_broadcast_len(columns, broadcast_len)
319    }
320
321    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
322    /// columns to broadcast_len.
323    pub fn new_with_broadcast_len(
324        columns: Vec<Column>,
325        broadcast_len: usize,
326    ) -> PolarsResult<Self> {
327        ensure_names_unique(&columns, |s| s.name().as_str())?;
328        unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
329    }
330
331    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
332    /// columns to match the other columns.
333    ///  
334    /// # Safety
335    /// Does not check that the column names are unique (which they must be).
336    pub unsafe fn new_with_broadcast_no_namecheck(
337        mut columns: Vec<Column>,
338        broadcast_len: usize,
339    ) -> PolarsResult<Self> {
340        for i in 0..columns.len() {
341            // Index loop: the error path below scans `columns` immutably. A length not equal to the broadcast len needs broadcast or is an error.
342            let len = columns[i].len();
343            if len != broadcast_len {
344                if len != 1 {
345                    let name = columns[i].name().to_owned();
346                    let extra_info =
347                        if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
348                            format!(" (matching column '{}')", c.name())
349                        } else {
350                            String::new()
351                        };
352                    polars_bail!(
353                        ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
354                    );
355                }
356                columns[i] = columns[i].new_from_index(0, broadcast_len);
357            }
358        }
359
360        let length = if columns.is_empty() { 0 } else { broadcast_len };
361
362        Ok(unsafe { DataFrame::new_no_checks(length, columns) })
363    }
364
365    pub fn new_from_index(&self, index: usize, height: usize) -> Self {
366        let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
367        unsafe { Self::new_no_checks(height, cols.collect()) }
368    }
369
370    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
371    ///
372    /// # Example
373    ///
374    /// ```rust
375    /// use polars_core::prelude::DataFrame;
376    /// static EMPTY: DataFrame = DataFrame::empty();
377    /// ```
378    pub const fn empty() -> Self {
379        Self::empty_with_height(0)
380    }
381
382    /// Creates an empty `DataFrame` with a specific `height`.
383    pub const fn empty_with_height(height: usize) -> Self {
384        DataFrame {
385            height,
386            columns: vec![],
387            cached_schema: OnceLock::new(),
388        }
389    }
390
391    /// Create an empty `DataFrame` with empty columns as per the `schema`.
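    ///
    /// # Example
    ///
    /// A minimal sketch using an illustrative single-field schema:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int64)]);
    /// let df = DataFrame::empty_with_schema(&schema);
    /// assert_eq!(df.shape(), (0, 1));
    /// ```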
392    pub fn empty_with_schema(schema: &Schema) -> Self {
393        let cols = schema
394            .iter()
395            .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
396            .collect();
397        unsafe { DataFrame::new_no_checks(0, cols) }
398    }
399
400    /// Create an empty `DataFrame` with empty columns as per the `schema`.
401    pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
402        let cols = schema
403            .iter_values()
404            .map(|fld| {
405                Column::from(Series::new_empty(
406                    fld.name.clone(),
407                    &(DataType::from_arrow_field(fld)),
408                ))
409            })
410            .collect();
411        unsafe { DataFrame::new_no_checks(0, cols) }
412    }
413
414    /// Create a new `DataFrame` with the given schema, only containing nulls.
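    ///
    /// # Example
    ///
    /// A minimal sketch; the single field is illustrative and the null count is read back with
    /// the usual `DataFrame::column` accessor from this crate:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int64)]);
    /// let df = DataFrame::full_null(&schema, 2);
    /// assert_eq!(df.shape(), (2, 1));
    /// assert_eq!(df.column("x")?.null_count(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```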
415    pub fn full_null(schema: &Schema, height: usize) -> Self {
416        let columns = schema
417            .iter_fields()
418            .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
419            .collect();
420        unsafe { DataFrame::new_no_checks(height, columns) }
421    }
422
423    /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
424    ///
425    /// # Example
426    ///
427    /// ```rust
428    /// # use polars_core::prelude::*;
429    /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
430    /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
431    /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
432    ///
433    /// assert_eq!(df.pop(), Some(s2));
434    /// assert_eq!(df.pop(), Some(s1));
435    /// assert_eq!(df.pop(), None);
436    /// assert!(df.is_empty());
437    /// # Ok::<(), PolarsError>(())
438    /// ```
439    pub fn pop(&mut self) -> Option<Column> {
440        self.clear_schema();
441
442        self.columns.pop()
443    }
444
445    /// Add a new column at index 0 that counts the rows.
446    ///
447    /// # Example
448    ///
449    /// ```
450    /// # use polars_core::prelude::*;
451    /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
452    /// assert_eq!(df1.shape(), (4, 1));
453    ///
454    /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
455    /// assert_eq!(df2.shape(), (4, 2));
456    /// println!("{}", df2);
457    ///
458    /// # Ok::<(), PolarsError>(())
459    /// ```
460    ///
461    /// Output:
462    ///
463    /// ```text
464    ///  shape: (4, 2)
465    ///  +-----+----------+
466    ///  | Id  | Name     |
467    ///  | --- | ---      |
468    ///  | u32 | str      |
469    ///  +=====+==========+
470    ///  | 0   | James    |
471    ///  +-----+----------+
472    ///  | 1   | Mary     |
473    ///  +-----+----------+
474    ///  | 2   | John     |
475    ///  +-----+----------+
476    ///  | 3   | Patricia |
477    ///  +-----+----------+
478    /// ```
479    pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
480        let mut columns = Vec::with_capacity(self.columns.len() + 1);
481        let offset = offset.unwrap_or(0);
482
483        let col = Column::new_row_index(name, offset, self.height())?;
484        columns.push(col);
485        columns.extend_from_slice(&self.columns);
486        DataFrame::new(columns)
487    }
488
489    /// Add a row index column in place.
490    ///
491    /// # Safety
492    /// The caller should ensure the DataFrame does not already contain a column with the given name.
493    ///
494    /// # Panics
495    /// Panics if the resulting column would reach or overflow IdxSize::MAX.
496    pub unsafe fn with_row_index_mut(
497        &mut self,
498        name: PlSmallStr,
499        offset: Option<IdxSize>,
500    ) -> &mut Self {
502        debug_assert!(
503            self.columns.iter().all(|c| c.name() != &name),
504            "with_row_index_mut(): column with name {} already exists",
505            &name
506        );
507
508        let offset = offset.unwrap_or(0);
509        let col = Column::new_row_index(name, offset, self.height()).unwrap();
510
511        self.clear_schema();
512        self.columns.insert(0, col);
513        self
514    }
515
516    /// Create a new `DataFrame` without checking the lengths or the uniqueness of the names of the
517    /// `Series`.
518    ///
519    /// Calculates the height from the first column or `0` if no columns are given.
520    ///
521    /// # Safety
522    ///
523    /// It is the caller's responsibility to uphold the contract of all `Series`
524    /// having an equal length and a unique name; if not, this may panic down the line.
525    pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
526        let height = columns.first().map_or(0, Column::len);
527        unsafe { Self::new_no_checks(height, columns) }
528    }
529
530    /// Create a new `DataFrame` without checking the lengths or the uniqueness of the names of the
531    /// `Series`.
532    ///
533    /// It is advised to use [DataFrame::new] in favor of this method.
534    ///
535    /// # Safety
536    ///
537    /// It is the caller's responsibility to uphold the contract of all `Series`
538    /// having an equal length and a unique name; if not, this may panic down the line.
539    pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
540        if cfg!(debug_assertions) {
541            DataFrame::validate_columns_slice(&columns).unwrap();
542        }
543
544        unsafe { Self::_new_no_checks_impl(height, columns) }
545    }
546
547    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
548    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
549    /// constructed with this method is generally highly unsafe and should not be long-lived.
550    #[allow(clippy::missing_safety_doc)]
551    pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
552        DataFrame {
553            height,
554            columns,
555            cached_schema: OnceLock::new(),
556        }
557    }
558
559    /// Shrink the capacity of this DataFrame to fit its length.
560    pub fn shrink_to_fit(&mut self) {
561        // Don't parallelize this. Memory overhead
562        for s in &mut self.columns {
563            s.shrink_to_fit();
564        }
565    }
566
567    /// Aggregate all the chunks in the DataFrame to a single chunk.
568    pub fn as_single_chunk(&mut self) -> &mut Self {
569        // Don't parallelize this. Memory overhead
570        for s in &mut self.columns {
571            *s = s.rechunk();
572        }
573        self
574    }
575
576    /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
577    /// This may lead to more peak memory consumption.
578    pub fn as_single_chunk_par(&mut self) -> &mut Self {
579        if self.columns.iter().any(|c| c.n_chunks() > 1) {
580            self.columns = self._apply_columns_par(&|s| s.rechunk());
581        }
582        self
583    }
584
585    /// Rechunks all columns to only have a single chunk.
586    pub fn rechunk_mut(&mut self) {
587        // SAFETY: We never adjust the length or names of the columns.
588        let columns = unsafe { self.get_columns_mut() };
589
590        for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
591            *col = col.rechunk();
592        }
593    }
594
595    pub fn _deshare_views_mut(&mut self) {
596        // SAFETY: We never adjust the length or names of the columns.
597        unsafe {
598            let columns = self.get_columns_mut();
599            for col in columns {
600                let Column::Series(s) = col else { continue };
601
602                if let Ok(ca) = s.binary() {
603                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
604                    *col = Column::from(gc_ca.into_series());
605                } else if let Ok(ca) = s.str() {
606                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
607                    *col = Column::from(gc_ca.into_series());
608                }
609            }
610        }
611    }
612
613    /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
614    pub fn rechunk_to_record_batch(
615        self,
616        compat_level: CompatLevel,
617    ) -> RecordBatchT<Box<dyn Array>> {
618        let height = self.height();
619
620        let (schema, arrays) = self
621            .columns
622            .into_iter()
623            .map(|col| {
624                let mut series = col.take_materialized_series();
625                // Rechunk to one chunk if necessary
626                if series.n_chunks() > 1 {
627                    series = series.rechunk();
628                }
629                (
630                    series.field().to_arrow(compat_level),
631                    series.to_arrow(0, compat_level),
632                )
633            })
634            .collect();
635
636        RecordBatchT::new(height, Arc::new(schema), arrays)
637    }
638
639    /// Returns `true` if the chunks of the columns do not align and re-chunking should be done.
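    ///
    /// # Example
    ///
    /// A minimal sketch: `vstack_mut` leaves one column with two chunks, `hstack` then adds a
    /// single-chunk column, so the chunks no longer align until the frame is rechunked.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut a = df!("a" => [1, 2])?;
    /// a.vstack_mut(&df!("a" => [3, 4])?)?;
    /// let b = Column::new("b".into(), [1, 2, 3, 4]);
    /// let mut df = a.hstack(&[b])?;
    /// assert!(df.should_rechunk());
    /// df.align_chunks_par();
    /// assert!(!df.should_rechunk());
    /// # Ok::<(), PolarsError>(())
    /// ```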
640    pub fn should_rechunk(&self) -> bool {
641        // Fast check. It is also needed for correctness, as code below doesn't check if the number
642        // of chunks is equal.
643        if !self
644            .get_columns()
645            .iter()
646            .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
647            .all_equal()
648        {
649            return true;
650        }
651
652        // From here we check chunk lengths.
653        let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
654        match chunk_lengths.next() {
655            None => false,
656            Some(first_column_chunk_lengths) => {
657                // Fast Path for single Chunk Series
658                if first_column_chunk_lengths.size_hint().0 == 1 {
659                    return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
660                }
661                // Always rechunk if we have more chunks than rows.
662                // except when we have an empty df containing a single chunk
663                let height = self.height();
664                let n_chunks = first_column_chunk_lengths.size_hint().0;
665                if n_chunks > height && !(height == 0 && n_chunks == 1) {
666                    return true;
667                }
668                // Slow Path for multi Chunk series
669                let v: Vec<_> = first_column_chunk_lengths.collect();
670                for cl in chunk_lengths {
671                    if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
672                        return true;
673                    }
674                }
675                false
676            },
677        }
678    }
679
680    /// Ensure all the chunks in the [`DataFrame`] are aligned.
681    pub fn align_chunks_par(&mut self) -> &mut Self {
682        if self.should_rechunk() {
683            self.as_single_chunk_par()
684        } else {
685            self
686        }
687    }
688
689    pub fn align_chunks(&mut self) -> &mut Self {
690        if self.should_rechunk() {
691            self.as_single_chunk()
692        } else {
693            self
694        }
695    }
696
697    /// Get the [`DataFrame`] schema.
698    ///
699    /// # Example
700    ///
701    /// ```rust
702    /// # use polars_core::prelude::*;
703    /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
704    ///                         "Diameter (m)" => [8.8e26, f64::INFINITY])?;
705    ///
706    /// let f1: Field = Field::new("Thing".into(), DataType::String);
707    /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
708    /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
709    ///
710    /// assert_eq!(&**df.schema(), &sc);
711    /// # Ok::<(), PolarsError>(())
712    /// ```
713    pub fn schema(&self) -> &SchemaRef {
714        let out = self.cached_schema.get_or_init(|| {
715            Arc::new(
716                self.columns
717                    .iter()
718                    .map(|x| (x.name().clone(), x.dtype().clone()))
719                    .collect(),
720            )
721        });
722
723        debug_assert_eq!(out.len(), self.width());
724
725        out
726    }
727
728    /// Get a reference to the [`DataFrame`] columns.
729    ///
730    /// # Example
731    ///
732    /// ```rust
733    /// # use polars_core::prelude::*;
734    /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
735    ///                         "Symbol" => ["A", "C", "G", "T"])?;
736    /// let columns: &[Column] = df.get_columns();
737    ///
738    /// assert_eq!(columns[0].name(), "Name");
739    /// assert_eq!(columns[1].name(), "Symbol");
740    /// # Ok::<(), PolarsError>(())
741    /// ```
742    #[inline]
743    pub fn get_columns(&self) -> &[Column] {
744        &self.columns
745    }
746
747    #[inline]
748    /// Get mutable access to the underlying columns.
749    ///
750    /// # Safety
751    ///
752    /// The caller must ensure the length of all [`Series`] remains equal to `height` or
753    /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
754    /// The caller must ensure that the cached schema is cleared if it modifies the schema by
755    /// calling [`DataFrame::clear_schema`].
756    pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
757        &mut self.columns
758    }
759
760    #[inline]
761    /// Remove all the columns in the [`DataFrame`] but keep the `height`.
762    pub fn clear_columns(&mut self) {
763        unsafe { self.get_columns_mut() }.clear();
764        self.clear_schema();
765    }
766
767    #[inline]
768    /// Extend the columns without checking for name collisions or height.
769    ///
770    /// # Safety
771    ///
772    /// The caller needs to ensure that:
773    /// - Column names are unique within the resulting [`DataFrame`].
774    /// - The length of each appended column matches the height of the [`DataFrame`]. For
775    ///   [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
776    ///   with [`DataFrame::set_height`].
777    pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
778        unsafe { self.get_columns_mut() }.extend(iter);
779        self.clear_schema();
780    }
781
782    /// Take ownership of the underlying columns vec.
783    pub fn take_columns(self) -> Vec<Column> {
784        self.columns
785    }
786
787    /// Iterator over the columns as [`Series`].
788    ///
789    /// # Example
790    ///
791    /// ```rust
792    /// # use polars_core::prelude::*;
793    /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
794    /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
795    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
796    ///
797    /// let mut iterator = df.iter();
798    ///
799    /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
800    /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
801    /// assert_eq!(iterator.next(), None);
802    /// # Ok::<(), PolarsError>(())
803    /// ```
804    pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
805        self.materialized_column_iter()
806    }
807
808    /// # Example
809    ///
810    /// ```rust
811    /// # use polars_core::prelude::*;
812    /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
813    ///                         "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
814    ///
815    /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
816    /// # Ok::<(), PolarsError>(())
817    /// ```
818    pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
819        self.columns.iter().map(|s| s.name()).collect()
820    }
821
822    /// Get the [`Vec<PlSmallStr>`] representing the column names.
823    pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
824        self.columns.iter().map(|s| s.name().clone()).collect()
825    }
826
827    pub fn get_column_names_str(&self) -> Vec<&str> {
828        self.columns.iter().map(|s| s.name().as_str()).collect()
829    }
830
831    /// Set the column names.
832    /// # Example
833    ///
834    /// ```rust
835    /// # use polars_core::prelude::*;
836    /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
837    /// df.set_column_names(["Set"])?;
838    ///
839    /// assert_eq!(df.get_column_names(), &["Set"]);
840    /// # Ok::<(), PolarsError>(())
841    /// ```
842    pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
843    where
844        I: IntoIterator<Item = S>,
845        S: Into<PlSmallStr>,
846    {
847        let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
848        self._set_column_names_impl(names.as_slice())
849    }
850
851    fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
852        polars_ensure!(
853            names.len() == self.width(),
854            ShapeMismatch: "{} column names provided for a DataFrame of width {}",
855            names.len(), self.width()
856        );
857        ensure_names_unique(names, |s| s.as_str())?;
858
859        let columns = mem::take(&mut self.columns);
860        self.columns = columns
861            .into_iter()
862            .zip(names)
863            .map(|(s, name)| {
864                let mut s = s;
865                s.rename(name.clone());
866                s
867            })
868            .collect();
869        self.clear_schema();
870        Ok(())
871    }
872
873    /// Get the data types of the columns in the [`DataFrame`].
874    ///
875    /// # Example
876    ///
877    /// ```rust
878    /// # use polars_core::prelude::*;
879    /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
880    ///                                "Fraction" => [0.965, 0.035])?;
881    ///
882    /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
883    /// # Ok::<(), PolarsError>(())
884    /// ```
885    pub fn dtypes(&self) -> Vec<DataType> {
886        self.columns.iter().map(|s| s.dtype().clone()).collect()
887    }
888
889    pub(crate) fn first_series_column(&self) -> Option<&Series> {
890        self.columns.iter().find_map(|col| col.as_series())
891    }
892
893    /// The number of chunks for the first column.
894    pub fn first_col_n_chunks(&self) -> usize {
895        match self.first_series_column() {
896            None if self.columns.is_empty() => 0,
897            None => 1,
898            Some(s) => s.n_chunks(),
899        }
900    }
901
902    /// The highest number of chunks for any column.
903    pub fn max_n_chunks(&self) -> usize {
904        self.columns
905            .iter()
906            .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
907            .max()
908            .unwrap_or(0)
909    }
910
911    /// Get a reference to the schema fields of the [`DataFrame`].
912    ///
913    /// # Example
914    ///
915    /// ```rust
916    /// # use polars_core::prelude::*;
917    /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
918    ///                            "Fraction" => [0.708, 0.292])?;
919    ///
920    /// let f1: Field = Field::new("Surface type".into(), DataType::String);
921    /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
922    ///
923    /// assert_eq!(earth.fields(), &[f1, f2]);
924    /// # Ok::<(), PolarsError>(())
925    /// ```
926    pub fn fields(&self) -> Vec<Field> {
927        self.columns
928            .iter()
929            .map(|s| s.field().into_owned())
930            .collect()
931    }
932
933    /// Get (height, width) of the [`DataFrame`].
934    ///
935    /// # Example
936    ///
937    /// ```rust
938    /// # use polars_core::prelude::*;
939    /// let df0: DataFrame = DataFrame::default();
940    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
941    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
942    ///                          "2" => [1, 2, 3, 4, 5])?;
943    ///
944    /// assert_eq!(df0.shape(), (0 ,0));
945    /// assert_eq!(df1.shape(), (5, 1));
946    /// assert_eq!(df2.shape(), (5, 2));
947    /// # Ok::<(), PolarsError>(())
948    /// ```
949    pub fn shape(&self) -> (usize, usize) {
950        (self.height, self.columns.len())
951    }
952
953    /// Get the width of the [`DataFrame`] which is the number of columns.
954    ///
955    /// # Example
956    ///
957    /// ```rust
958    /// # use polars_core::prelude::*;
959    /// let df0: DataFrame = DataFrame::default();
960    /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
961    /// let df2: DataFrame = df!("Series 1" => [0; 0],
962    ///                          "Series 2" => [0; 0])?;
963    ///
964    /// assert_eq!(df0.width(), 0);
965    /// assert_eq!(df1.width(), 1);
966    /// assert_eq!(df2.width(), 2);
967    /// # Ok::<(), PolarsError>(())
968    /// ```
969    pub fn width(&self) -> usize {
970        self.columns.len()
971    }
972
973    /// Get the height of the [`DataFrame`] which is the number of rows.
974    ///
975    /// # Example
976    ///
977    /// ```rust
978    /// # use polars_core::prelude::*;
979    /// let df0: DataFrame = DataFrame::default();
980    /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
981    /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
982    ///
983    /// assert_eq!(df0.height(), 0);
984    /// assert_eq!(df1.height(), 2);
985    /// assert_eq!(df2.height(), 5);
986    /// # Ok::<(), PolarsError>(())
987    /// ```
988    pub fn height(&self) -> usize {
989        self.height
990    }
991
992    /// Returns the size as the number of rows times the number of columns.
993    pub fn size(&self) -> usize {
994        let s = self.shape();
995        s.0 * s.1
996    }
997
998    /// Returns `true` if the [`DataFrame`] contains no rows.
999    ///
1000    /// # Example
1001    ///
1002    /// ```rust
1003    /// # use polars_core::prelude::*;
1004    /// let df1: DataFrame = DataFrame::default();
1005    /// assert!(df1.is_empty());
1006    ///
1007    /// let df2: DataFrame = df!("First name" => ["Forever"],
1008    ///                          "Last name" => ["Alone"])?;
1009    /// assert!(!df2.is_empty());
1010    /// # Ok::<(), PolarsError>(())
1011    /// ```
1012    pub fn is_empty(&self) -> bool {
1013        matches!(self.shape(), (0, _) | (_, 0))
1014    }
1015
1016    /// Set the height (i.e. number of rows) of this [`DataFrame`].
1017    ///
1018    /// # Safety
1019    ///
1020    /// This needs to be equal to the length of all the columns.
1021    pub unsafe fn set_height(&mut self, height: usize) {
1022        self.height = height;
1023    }
1024
1025    /// Add multiple [`Series`] to a [`DataFrame`].
1026    /// The added columns are required to have the same length as the existing columns.
1027    ///
1028    /// # Example
1029    ///
1030    /// ```rust
1031    /// # use polars_core::prelude::*;
1032    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1033    /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1034    /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1035    ///
1036    /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1037    /// assert_eq!(df2.shape(), (3, 3));
1038    /// println!("{}", df2);
1039    /// # Ok::<(), PolarsError>(())
1040    /// ```
1041    ///
1042    /// Output:
1043    ///
1044    /// ```text
1045    /// shape: (3, 3)
1046    /// +---------+--------+----------+
1047    /// | Element | Proton | Electron |
1048    /// | ---     | ---    | ---      |
1049    /// | str     | i32    | i32      |
1050    /// +=========+========+==========+
1051    /// | Copper  | 29     | 29       |
1052    /// +---------+--------+----------+
1053    /// | Silver  | 47     | 47       |
1054    /// +---------+--------+----------+
1055    /// | Gold    | 79     | 79       |
1056    /// +---------+--------+----------+
1057    /// ```
1058    pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1059        let mut new_cols = self.columns.clone();
1060        new_cols.extend_from_slice(columns);
1061        DataFrame::new(new_cols)
1062    }
1063
1064    /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1065    ///
1066    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1067    ///
1068    /// # Example
1069    ///
1070    /// ```rust
1071    /// # use polars_core::prelude::*;
1072    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1073    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1074    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1075    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1076    ///
1077    /// let df3: DataFrame = df1.vstack(&df2)?;
1078    ///
1079    /// assert_eq!(df3.shape(), (5, 2));
1080    /// println!("{}", df3);
1081    /// # Ok::<(), PolarsError>(())
1082    /// ```
1083    ///
1084    /// Output:
1085    ///
1086    /// ```text
1087    /// shape: (5, 2)
1088    /// +-----------+-------------------+
1089    /// | Element   | Melting Point (K) |
1090    /// | ---       | ---               |
1091    /// | str       | f64               |
1092    /// +===========+===================+
1093    /// | Copper    | 1357.77           |
1094    /// +-----------+-------------------+
1095    /// | Silver    | 1234.93           |
1096    /// +-----------+-------------------+
1097    /// | Gold      | 1337.33           |
1098    /// +-----------+-------------------+
1099    /// | Platinum  | 2041.4            |
1100    /// +-----------+-------------------+
1101    /// | Palladium | 1828.05           |
1102    /// +-----------+-------------------+
1103    /// ```
1104    pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1105        let mut df = self.clone();
1106        df.vstack_mut(other)?;
1107        Ok(df)
1108    }
1109
1110    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1111    ///
1112    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1113    ///
1114    /// # Example
1115    ///
1116    /// ```rust
1117    /// # use polars_core::prelude::*;
1118    /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1119    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1120    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1121    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1122    ///
1123    /// df1.vstack_mut(&df2)?;
1124    ///
1125    /// assert_eq!(df1.shape(), (5, 2));
1126    /// println!("{}", df1);
1127    /// # Ok::<(), PolarsError>(())
1128    /// ```
1129    ///
1130    /// Output:
1131    ///
1132    /// ```text
1133    /// shape: (5, 2)
1134    /// +-----------+-------------------+
1135    /// | Element   | Melting Point (K) |
1136    /// | ---       | ---               |
1137    /// | str       | f64               |
1138    /// +===========+===================+
1139    /// | Copper    | 1357.77           |
1140    /// +-----------+-------------------+
1141    /// | Silver    | 1234.93           |
1142    /// +-----------+-------------------+
1143    /// | Gold      | 1337.33           |
1144    /// +-----------+-------------------+
1145    /// | Platinum  | 2041.4            |
1146    /// +-----------+-------------------+
1147    /// | Palladium | 1828.05           |
1148    /// +-----------+-------------------+
1149    /// ```
1150    pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1151        if self.width() != other.width() {
1152            polars_ensure!(
1153                self.width() == 0,
1154                ShapeMismatch:
1155                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1156                self.width(), other.width(),
1157            );
1158            self.columns.clone_from(&other.columns);
1159            self.height = other.height;
1160            return Ok(self);
1161        }
1162
1163        self.columns
1164            .iter_mut()
1165            .zip(other.columns.iter())
1166            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1167                ensure_can_extend(&*left, right)?;
1168                left.append(right).map_err(|e| {
1169                    e.context(format!("failed to vstack column '{}'", right.name()).into())
1170                })?;
1171                Ok(())
1172            })?;
1173        self.height += other.height;
1174        Ok(self)
1175    }
1176
1177    pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1178        if self.width() != other.width() {
1179            polars_ensure!(
1180                self.width() == 0,
1181                ShapeMismatch:
1182                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1183                self.width(), other.width(),
1184            );
1185            self.columns = other.columns;
1186            self.height = other.height;
1187            return Ok(self);
1188        }
1189
1190        self.columns
1191            .iter_mut()
1192            .zip(other.columns.into_iter())
1193            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1194                ensure_can_extend(&*left, &right)?;
1195                let right_name = right.name().clone();
1196                left.append_owned(right).map_err(|e| {
1197                    e.context(format!("failed to vstack column '{right_name}'").into())
1198                })?;
1199                Ok(())
1200            })?;
1201        self.height += other.height;
1202        Ok(self)
1203    }
1204
1205    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1206    ///
1207    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1208    ///
1209    /// # Panics
1210    /// Panics if the schemas don't match.
1211    pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1212        self.columns
1213            .iter_mut()
1214            .zip(other.columns.iter())
1215            .for_each(|(left, right)| {
1216                left.append(right)
1217                    .map_err(|e| {
1218                        e.context(format!("failed to vstack column '{}'", right.name()).into())
1219                    })
1220                    .expect("should not fail");
1221            });
1222        self.height += other.height;
1223    }
1224
1225    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1226    ///
1227    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1228    ///
1229    /// # Panics
1230    /// Panics if the schemas don't match.
1231    pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1232        self.columns
1233            .iter_mut()
1234            .zip(other.columns)
1235            .for_each(|(left, right)| {
1236                left.append_owned(right).expect("should not fail");
1237            });
1238        self.height += other.height;
1239    }
1240
1241    /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1242    ///
1243    /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1244    /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1245    ///
1246    /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1247    /// and thus will yield faster queries.
1248    ///
1249    /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1250    /// online operations where you add `n` rows and rerun a query.
1251    ///
1252    /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1253    /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1254    /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
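    ///
    /// # Example
    ///
    /// A minimal sketch of appending rows in place (illustrative values):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df1 = df!("x" => [1, 2])?;
    /// let df2 = df!("x" => [3, 4])?;
    /// df1.extend(&df2)?;
    /// assert_eq!(df1.height(), 4);
    /// # Ok::<(), PolarsError>(())
    /// ```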
1255    pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1256        polars_ensure!(
1257            self.width() == other.width(),
1258            ShapeMismatch:
1259            "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1260            self.width(), other.width(),
1261        );
1262
1263        self.columns
1264            .iter_mut()
1265            .zip(other.columns.iter())
1266            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1267                ensure_can_extend(&*left, right)?;
1268                left.extend(right).map_err(|e| {
1269                    e.context(format!("failed to extend column '{}'", right.name()).into())
1270                })?;
1271                Ok(())
1272            })?;
1273        self.height += other.height;
1274        self.clear_schema();
1275        Ok(())
1276    }
1277
1278    /// Remove a column by name and return the column removed.
1279    ///
1280    /// # Example
1281    ///
1282    /// ```rust
1283    /// # use polars_core::prelude::*;
1284    /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1285    ///                             "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1286    ///
1287    /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1288    /// assert!(s1.is_err());
1289    ///
1290    /// let s2: Column = df.drop_in_place("Animal")?;
1291    /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1292    /// # Ok::<(), PolarsError>(())
1293    /// ```
1294    pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1295        let idx = self.check_name_to_idx(name)?;
1296        self.clear_schema();
1297        Ok(self.columns.remove(idx))
1298    }
1299
1300    /// Return a new [`DataFrame`] where all rows that contain null values are dropped.
1301    ///
1302    /// # Example
1303    ///
1304    /// ```no_run
1305    /// # use polars_core::prelude::*;
1306    /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1307    ///                         "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1308    /// assert_eq!(df1.shape(), (3, 2));
1309    ///
1310    /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1311    /// assert_eq!(df2.shape(), (1, 2));
1312    /// println!("{}", df2);
1313    /// # Ok::<(), PolarsError>(())
1314    /// ```
1315    ///
1316    /// Output:
1317    ///
1318    /// ```text
1319    /// shape: (1, 2)
1320    /// +---------+---------------------+
1321    /// | Country | Tax revenue (% GDP) |
1322    /// | ---     | ---                 |
1323    /// | str     | f64                 |
1324    /// +=========+=====================+
1325    /// | Malta   | 32.7                |
1326    /// +---------+---------------------+
1327    /// ```
1328    pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1329    where
1330        for<'a> &'a S: Into<PlSmallStr>,
1331    {
1332        if let Some(v) = subset {
1333            let v = self.select_columns(v)?;
1334            self._drop_nulls_impl(v.as_slice())
1335        } else {
1336            self._drop_nulls_impl(self.columns.as_slice())
1337        }
1338    }
1339
1340    fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1341        // fast path for no nulls in df
1342        if subset.iter().all(|s| !s.has_nulls()) {
1343            return Ok(self.clone());
1344        }
1345
1346        let mut iter = subset.iter();
1347
1348        let mask = iter
1349            .next()
1350            .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1351        let mut mask = mask.is_not_null();
1352
1353        for c in iter {
1354            mask = mask & c.is_not_null();
1355        }
1356        self.filter(&mask)
1357    }
1358
1359    /// Drop a column by name.
1360    /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1361    /// the current one in place.
1362    ///
1363    /// # Example
1364    ///
1365    /// ```rust
1366    /// # use polars_core::prelude::*;
1367    /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1368    /// let df2: DataFrame = df1.drop("Ray type")?;
1369    ///
1370    /// assert!(df2.is_empty());
1371    /// # Ok::<(), PolarsError>(())
1372    /// ```
1373    pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1374        let idx = self.check_name_to_idx(name)?;
1375        let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1376
1377        self.columns.iter().enumerate().for_each(|(i, s)| {
1378            if i != idx {
1379                new_cols.push(s.clone())
1380            }
1381        });
1382
1383        Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1384    }
1385
1386    /// Drop columns that are in `names`.
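    ///
    /// # Example
    ///
    /// A minimal sketch with illustrative column names:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1], "b" => [2], "c" => [3])?;
    /// let out = df.drop_many(["a", "c"]);
    /// assert_eq!(out.get_column_names_str(), &["b"]);
    /// # Ok::<(), PolarsError>(())
    /// ```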
1387    pub fn drop_many<I, S>(&self, names: I) -> Self
1388    where
1389        I: IntoIterator<Item = S>,
1390        S: Into<PlSmallStr>,
1391    {
1392        let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1393        self.drop_many_amortized(&names)
1394    }
1395
1396    /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1397    pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1398        if names.is_empty() {
1399            return self.clone();
1400        }
1401        let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1402        self.columns.iter().for_each(|s| {
1403            if !names.contains(s.name()) {
1404                new_cols.push(s.clone())
1405            }
1406        });
1407
1408        unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1409    }
1410
1411    /// Insert a new column at a given index without checking for duplicates.
1412    /// This can leave the [`DataFrame`] in an invalid state.
1413    fn insert_column_no_name_check(
1414        &mut self,
1415        index: usize,
1416        column: Column,
1417    ) -> PolarsResult<&mut Self> {
1418        polars_ensure!(
1419            self.width() == 0 || column.len() == self.height(),
1420            ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1421            column.len(), self.height(),
1422        );
1423
1424        if self.width() == 0 {
1425            self.height = column.len();
1426        }
1427
1428        self.columns.insert(index, column);
1429        self.clear_schema();
1430        Ok(self)
1431    }
1432
1433    /// Insert a new column at a given index.
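    ///
    /// # Example
    ///
    /// A minimal sketch with illustrative values:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [1, 2], "c" => [5, 6])?;
    /// df.insert_column(1, Column::new("b".into(), [3, 4]))?;
    /// assert_eq!(df.get_column_names_str(), &["a", "b", "c"]);
    /// # Ok::<(), PolarsError>(())
    /// ```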
1434    pub fn insert_column<S: IntoColumn>(
1435        &mut self,
1436        index: usize,
1437        column: S,
1438    ) -> PolarsResult<&mut Self> {
1439        let column = column.into_column();
1440        self.check_already_present(column.name().as_str())?;
1441        self.insert_column_no_name_check(index, column)
1442    }
1443
1444    fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1445        if let Some(idx) = self.get_column_index(column.name().as_str()) {
1446            self.replace_column(idx, column)?;
1447        } else {
1448            if self.width() == 0 {
1449                self.height = column.len();
1450            }
1451
1452            self.columns.push(column);
1453            self.clear_schema();
1454        }
1455        Ok(())
1456    }
1457
1458    /// Add a new column to this [`DataFrame`] or replace an existing one.
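    ///
    /// # Example
    ///
    /// A minimal sketch; note that a length-1 column is broadcast to the frame's height:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [1, 2, 3])?;
    /// df.with_column(Column::new("b".into(), [10]))?;
    /// assert_eq!(df.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```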
1459    pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1460        fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1461            let height = df.height();
1462            if column.len() == 1 && height > 1 {
1463                column = column.new_from_index(0, height);
1464            }
1465
1466            if column.len() == height || df.get_columns().is_empty() {
1467                df.add_column_by_search(column)?;
1468                Ok(df)
1469            }
1470            // special case for literals
1471            else if height == 0 && column.len() == 1 {
1472                let s = column.clear();
1473                df.add_column_by_search(s)?;
1474                Ok(df)
1475            } else {
1476                polars_bail!(
1477                    ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1478                    column.len(), height,
1479                );
1480            }
1481        }
1482        let column = column.into_column();
1483        inner(self, column)
1484    }
1485
1486    /// Adds a column to the [`DataFrame`] without doing any checks
1487    /// on length or duplicates.
1488    ///
1489    /// # Safety
1490    /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1491    pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1492        debug_assert!(self.width() == 0 || self.height() == column.len());
1493        debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1494
1495        // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1496        // properly for `width` == 0.
1497        if self.width() == 0 {
1498            unsafe { self.set_height(column.len()) };
1499        }
1500        unsafe { self.get_columns_mut() }.push(column);
1501        self.clear_schema();
1502
1503        self
1504    }
1505
1506    // Note: Schema can be both input or output_schema
1507    fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1508        let name = c.name();
1509        if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1510            if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1511                // The given schema is the output schema, so we can push.
1512                if idx == self.columns.len() {
1513                    if self.width() == 0 {
1514                        self.height = c.len();
1515                    }
1516
1517                    self.columns.push(c);
1518                    self.clear_schema();
1519                }
1520                // Schema is incorrect; fall back to search.
1521                else {
1522                    debug_assert!(false);
1523                    self.add_column_by_search(c)?;
1524                }
1525            } else {
1526                self.replace_column(idx, c)?;
1527            }
1528        } else {
1529            if self.width() == 0 {
1530                self.height = c.len();
1531            }
1532
1533            self.columns.push(c);
1534            self.clear_schema();
1535        }
1536
1537        Ok(())
1538    }
1539
1540    // Note: The schema can be either the input schema or the output schema.
1541    pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1542        for (i, s) in series.into_iter().enumerate() {
1543            // we need to branch here
1544            // because users can add multiple columns with the same name
1545            if i == 0 || schema.get(s.name().as_str()).is_some() {
1546                self.with_column_and_schema(s.into_column(), schema)?;
1547            } else {
1548                self.with_column(s.clone().into_column())?;
1549            }
1550        }
1551        Ok(())
1552    }
1553
1554    pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1555        for (i, s) in columns.into_iter().enumerate() {
1556            // we need to branch here
1557            // because users can add multiple columns with the same name
1558            if i == 0 || schema.get(s.name().as_str()).is_some() {
1559                self.with_column_and_schema(s, schema)?;
1560            } else {
1561                self.with_column(s.clone())?;
1562            }
1563        }
1564
1565        Ok(())
1566    }
1567
1568    /// Add a new column to this [`DataFrame`] or replace an existing one.
1569    /// Uses an existing schema to amortize lookups.
1570    /// If the schema is incorrect, we fall back to a linear search.
1571    ///
1572    /// Note: The schema can be either the input schema or the output schema.
1573    pub fn with_column_and_schema<C: IntoColumn>(
1574        &mut self,
1575        column: C,
1576        schema: &Schema,
1577    ) -> PolarsResult<&mut Self> {
1578        let mut column = column.into_column();
1579
1580        let height = self.height();
1581        if column.len() == 1 && height > 1 {
1582            column = column.new_from_index(0, height);
1583        }
1584
1585        if column.len() == height || self.columns.is_empty() {
1586            self.add_column_by_schema(column, schema)?;
1587            Ok(self)
1588        }
1589        // special case for literals
1590        else if height == 0 && column.len() == 1 {
1591            let s = column.clear();
1592            self.add_column_by_schema(s, schema)?;
1593            Ok(self)
1594        } else {
1595            polars_bail!(
1596                ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1597                column.len(), height,
1598            );
1599        }
1600    }
1601
1602    /// Get a row in the [`DataFrame`]. Beware this is slow.
1603    ///
1604    /// # Example
1605    ///
1606    /// ```
1607    /// # use polars_core::prelude::*;
1608    /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1609    ///     df.get(idx)
1610    /// }
1611    /// ```
1612    pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1613        match self.columns.first() {
1614            Some(s) => {
1615                if s.len() <= idx {
1616                    return None;
1617                }
1618            },
1619            None => return None,
1620        }
1621        // SAFETY: we just checked bounds
1622        unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1623    }
1624
1625    /// Select a [`Series`] by index.
1626    ///
1627    /// # Example
1628    ///
1629    /// ```rust
1630    /// # use polars_core::prelude::*;
1631    /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1632    ///                         "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1633    ///
1634    /// let s1: Option<&Column> = df.select_at_idx(0);
1635    /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1636    ///
1637    /// assert_eq!(s1, Some(&s2));
1638    /// # Ok::<(), PolarsError>(())
1639    /// ```
1640    pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1641        self.columns.get(idx)
1642    }
1643
1644    /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`].
1645    ///
1646    /// # Examples
1647    ///
1648    /// ```rust
1649    /// # use polars_core::prelude::*;
1650    /// let df = df! {
1651    ///     "0" => [0, 0, 0],
1652    ///     "1" => [1, 1, 1],
1653    ///     "2" => [2, 2, 2]
1654    /// }?;
1655    ///
1656    /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1657    /// assert!(df.equals(&df.select_by_range(..)?));
1658    /// # Ok::<(), PolarsError>(())
1659    /// ```
1660    pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1661    where
1662        R: ops::RangeBounds<usize>,
1663    {
1664        // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1665        // because it is a nightly-only feature. We should switch to the std version once it is stabilized.
1666        fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1667        where
1668            R: ops::RangeBounds<usize>,
1669        {
1670            let len = bounds.end;
1671
1672            let start: ops::Bound<&usize> = range.start_bound();
1673            let start = match start {
1674                ops::Bound::Included(&start) => start,
1675                ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1676                    panic!("attempted to index slice from after maximum usize");
1677                }),
1678                ops::Bound::Unbounded => 0,
1679            };
1680
1681            let end: ops::Bound<&usize> = range.end_bound();
1682            let end = match end {
1683                ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1684                    panic!("attempted to index slice up to maximum usize");
1685                }),
1686                ops::Bound::Excluded(&end) => end,
1687                ops::Bound::Unbounded => len,
1688            };
1689
1690            if start > end {
1691                panic!("slice index starts at {start} but ends at {end}");
1692            }
1693            if end > len {
1694                panic!("range end index {end} out of range for slice of length {len}",);
1695            }
1696
1697            ops::Range { start, end }
1698        }
1699
1700        let colnames = self.get_column_names_owned();
1701        let range = get_range(range, ..colnames.len());
1702
1703        self._select_impl(&colnames[range])
1704    }
1705
1706    /// Get column index of a [`Series`] by name.
1707    /// # Example
1708    ///
1709    /// ```rust
1710    /// # use polars_core::prelude::*;
1711    /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1712    ///                         "Health" => [100, 200, 500],
1713    ///                         "Mana" => [250, 100, 0],
1714    ///                         "Strength" => [30, 150, 300])?;
1715    ///
1716    /// assert_eq!(df.get_column_index("Name"), Some(0));
1717    /// assert_eq!(df.get_column_index("Health"), Some(1));
1718    /// assert_eq!(df.get_column_index("Mana"), Some(2));
1719    /// assert_eq!(df.get_column_index("Strength"), Some(3));
1720    /// assert_eq!(df.get_column_index("Haste"), None);
1721    /// # Ok::<(), PolarsError>(())
1722    /// ```
1723    pub fn get_column_index(&self, name: &str) -> Option<usize> {
1724        let schema = self.schema();
1725        if let Some(idx) = schema.index_of(name) {
1726            if self
1727                .get_columns()
1728                .get(idx)
1729                .is_some_and(|c| c.name() == name)
1730            {
1731                return Some(idx);
1732            }
1733        }
1734
1735        self.columns.iter().position(|s| s.name().as_str() == name)
1736    }
1737
1738    /// Get column index of a [`Series`] by name.
1739    pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1740        self.get_column_index(name)
1741            .ok_or_else(|| polars_err!(col_not_found = name))
1742    }
1743
1744    /// Select a single column by name.
1745    ///
1746    /// # Example
1747    ///
1748    /// ```rust
1749    /// # use polars_core::prelude::*;
1750    /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1751    /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1752    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1753    ///
1754    /// assert_eq!(df.column("Password")?, &s1);
1755    /// # Ok::<(), PolarsError>(())
1756    /// ```
1757    pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1758        let idx = self.try_get_column_index(name)?;
1759        Ok(self.select_at_idx(idx).unwrap())
1760    }
1761
1762    /// Select multiple columns by name.
1763    ///
1764    /// # Example
1765    ///
1766    /// ```rust
1767    /// # use polars_core::prelude::*;
1768    /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1769    ///                         "Max weight (kg)" => [16.0, 35.89])?;
1770    /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1771    ///
1772    /// assert_eq!(&df[0], sv[0]);
1773    /// assert_eq!(&df[1], sv[1]);
1774    /// # Ok::<(), PolarsError>(())
1775    /// ```
1776    pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1777    where
1778        I: IntoIterator<Item = S>,
1779        S: AsRef<str>,
1780    {
1781        names
1782            .into_iter()
1783            .map(|name| self.column(name.as_ref()))
1784            .collect()
1785    }
1786
1787    /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1788    ///
1789    /// # Examples
1790    ///
1791    /// ```
1792    /// # use polars_core::prelude::*;
1793    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1794    ///     df.select(["foo", "bar"])
1795    /// }
1796    /// ```
1797    pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1798    where
1799        I: IntoIterator<Item = S>,
1800        S: Into<PlSmallStr>,
1801    {
1802        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1803        self._select_impl(cols.as_slice())
1804    }
1805
1806    pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1807        ensure_names_unique(cols, |s| s.as_str())?;
1808        self._select_impl_unchecked(cols)
1809    }
1810
1811    pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1812        let selected = self.select_columns_impl(cols)?;
1813        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1814    }
1815
1816    /// Select with a known schema. The schema names must match the column names of this DataFrame.
1817    pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1818    where
1819        I: IntoIterator<Item = S>,
1820        S: Into<PlSmallStr>,
1821    {
1822        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1823        self._select_with_schema_impl(&cols, schema, true)
1824    }
1825
1826    /// Select with a known schema without checking for duplicates in `selection`.
1827    /// The schema names must match the column names of this DataFrame.
1828    pub fn select_with_schema_unchecked<I, S>(
1829        &self,
1830        selection: I,
1831        schema: &Schema,
1832    ) -> PolarsResult<Self>
1833    where
1834        I: IntoIterator<Item = S>,
1835        S: Into<PlSmallStr>,
1836    {
1837        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1838        self._select_with_schema_impl(&cols, schema, false)
1839    }
1840
1841    /// The schema names must match the column names of this DataFrame.
1842    pub fn _select_with_schema_impl(
1843        &self,
1844        cols: &[PlSmallStr],
1845        schema: &Schema,
1846        check_duplicates: bool,
1847    ) -> PolarsResult<Self> {
1848        if check_duplicates {
1849            ensure_names_unique(cols, |s| s.as_str())?;
1850        }
1851
1852        let selected = self.select_columns_impl_with_schema(cols, schema)?;
1853        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1854    }
1855
1856    /// A non-generic implementation to reduce compiler bloat.
1857    fn select_columns_impl_with_schema(
1858        &self,
1859        cols: &[PlSmallStr],
1860        schema: &Schema,
1861    ) -> PolarsResult<Vec<Column>> {
1862        if cfg!(debug_assertions) {
1863            ensure_matching_schema_names(schema, self.schema())?;
1864        }
1865
1866        cols.iter()
1867            .map(|name| {
1868                let index = schema.try_get_full(name.as_str())?.0;
1869                Ok(self.columns[index].clone())
1870            })
1871            .collect()
1872    }
1873
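    /// Select column(s) from this [`DataFrame`], converting each selected column to its
    /// physical representation, and return them as a new [`DataFrame`].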
1874    pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1875    where
1876        I: IntoIterator<Item = S>,
1877        S: Into<PlSmallStr>,
1878    {
1879        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1880        self.select_physical_impl(&cols)
1881    }
1882
1883    fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1884        ensure_names_unique(cols, |s| s.as_str())?;
1885        let selected = self.select_columns_physical_impl(cols)?;
1886        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1887    }
1888
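    /// Select the columns named in `to`, in that order, and attach `to` as the cached schema.
    ///
    /// Every name in `to` must exist in this [`DataFrame`]. The columns themselves are not cast,
    /// so the dtypes in `to` are assumed to already match the existing columns.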
1889    pub fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1890        let from = self.schema();
1891        let columns = to
1892            .iter_names()
1893            .map(|name| Ok(self.columns[from.try_index_of(name.as_str())?].clone()))
1894            .collect::<PolarsResult<Vec<_>>>()?;
1895        let mut df = unsafe { Self::new_no_checks(self.height(), columns) };
1896        df.cached_schema = to.into();
1897        Ok(df)
1898    }
1899
1900    /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1901    ///
1902    /// # Example
1903    ///
1904    /// ```rust
1905    /// # use polars_core::prelude::*;
1906    /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1907    ///                         "Carbon" => [1, 2, 3],
1908    ///                         "Hydrogen" => [4, 6, 8])?;
1909    /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1910    ///
1911    /// assert_eq!(df["Carbon"], sv[0]);
1912    /// assert_eq!(df["Hydrogen"], sv[1]);
1913    /// # Ok::<(), PolarsError>(())
1914    /// ```
1915    pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1916        let cols = selection.into_vec();
1917        self.select_columns_impl(&cols)
1918    }
1919
1920    fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1921        self.columns
1922            .iter()
1923            .enumerate()
1924            .map(|(i, s)| (s.name().as_str(), i))
1925            .collect()
1926    }
1927
1928    /// A non-generic implementation to reduce compiler bloat.
1929    fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1930        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1931            let name_to_idx = self._names_to_idx_map();
1932            cols.iter()
1933                .map(|name| {
1934                    let idx = *name_to_idx
1935                        .get(name.as_str())
1936                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1937                    Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1938                })
1939                .collect::<PolarsResult<Vec<_>>>()?
1940        } else {
1941            cols.iter()
1942                .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1943                .collect::<PolarsResult<Vec<_>>>()?
1944        };
1945
1946        Ok(selected)
1947    }
1948
1949    /// A non-generic implementation to reduce compiler bloat.
1950    fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1951        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1952            // We hash because there are users that have millions of columns.
1953            // # https://github.com/pola-rs/polars/issues/1023
1954            let name_to_idx = self._names_to_idx_map();
1955
1956            cols.iter()
1957                .map(|name| {
1958                    let idx = *name_to_idx
1959                        .get(name.as_str())
1960                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1961                    Ok(self.select_at_idx(idx).unwrap().clone())
1962                })
1963                .collect::<PolarsResult<Vec<_>>>()?
1964        } else {
1965            cols.iter()
1966                .map(|c| self.column(c.as_str()).cloned())
1967                .collect::<PolarsResult<Vec<_>>>()?
1968        };
1969
1970        Ok(selected)
1971    }
1972
1973    fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1974        // If there is a filtered column, its length is the new height.
1975        if let Some(fst) = filtered.first() {
1976            return fst.len();
1977        }
1978
1979        // Otherwise, count the number of mask values that are true and use that as the height.
1980        let num_trues = mask.num_trues();
1981        if mask.len() == self.height() {
1982            num_trues
1983        } else {
1984            // This is for broadcasting masks
1985            debug_assert!(num_trues == 0 || num_trues == 1);
1986            self.height() * num_trues
1987        }
1988    }
1989
1990    /// Take the [`DataFrame`] rows by a boolean mask.
1991    ///
1992    /// # Example
1993    ///
1994    /// ```
1995    /// # use polars_core::prelude::*;
1996    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1997    ///     let mask = df.column("sepal_width")?.is_not_null();
1998    ///     df.filter(&mask)
1999    /// }
2000    /// ```
2001    pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2002        let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2003        let height = self.filter_height(&new_col, mask);
2004
2005        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2006    }
2007
2008    /// Same as `filter` but does not parallelize.
2009    pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2010        let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2011        let height = self.filter_height(&new_col, mask);
2012
2013        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2014    }
2015
2016    /// Take [`DataFrame`] rows by index values.
2017    ///
2018    /// # Example
2019    ///
2020    /// ```
2021    /// # use polars_core::prelude::*;
2022    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2023    ///     let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2024    ///     df.take(&idx)
2025    /// }
2026    /// ```
2027    pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2028        let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2029
2030        Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2031    }
2032
2033    /// # Safety
2034    /// The indices must be in-bounds.
2035    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2036        self.take_unchecked_impl(idx, true)
2037    }
2038
2039    /// # Safety
2040    /// The indices must be in-bounds.
2041    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2042        let cols = if allow_threads {
2043            POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
2044        } else {
2045            self._apply_columns(&|s| s.take_unchecked(idx))
2046        };
2047        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2048    }
2049
2050    /// # Safety
2051    /// The indices must be in-bounds.
2052    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2053        self.take_slice_unchecked_impl(idx, true)
2054    }
2055
2056    /// # Safety
2057    /// The indices must be in-bounds.
2058    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2059        let cols = if allow_threads {
2060            POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2061        } else {
2062            self._apply_columns(&|s| s.take_slice_unchecked(idx))
2063        };
2064        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2065    }
2066
2067    /// Rename a column in the [`DataFrame`].
2068    ///
2069    /// # Example
2070    ///
2071    /// ```
2072    /// # use polars_core::prelude::*;
2073    /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2074    ///     let original_name = "foo";
2075    ///     let new_name = "bar";
2076    ///     df.rename(original_name, new_name.into())
2077    /// }
2078    /// ```
2079    pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2080        if column == name.as_str() {
2081            return Ok(self);
2082        }
2083        polars_ensure!(
2084            !self.schema().contains(&name),
2085            Duplicate: "column rename attempted with already existing name \"{name}\""
2086        );
2087
2088        self.get_column_index(column)
2089            .and_then(|idx| self.columns.get_mut(idx))
2090            .ok_or_else(|| polars_err!(col_not_found = column))
2091            .map(|c| c.rename(name))?;
2092        self.clear_schema();
2093
2094        Ok(self)
2095    }
2096
2097    /// Sort [`DataFrame`] in place.
2098    ///
2099    /// See [`DataFrame::sort`] for more details.
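    ///
    /// # Example
    ///
    /// A minimal sketch, mirroring the [`DataFrame::sort`] examples:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn sort_by_a(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
    ///     df.sort_in_place(["a"], Default::default())
    /// }
    /// ```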
2100    pub fn sort_in_place(
2101        &mut self,
2102        by: impl IntoVec<PlSmallStr>,
2103        sort_options: SortMultipleOptions,
2104    ) -> PolarsResult<&mut Self> {
2105        let by_column = self.select_columns(by)?;
2106        self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2107        Ok(self)
2108    }
2109
2110    #[doc(hidden)]
2111    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
2112    pub fn sort_impl(
2113        &self,
2114        by_column: Vec<Column>,
2115        mut sort_options: SortMultipleOptions,
2116        slice: Option<(i64, usize)>,
2117    ) -> PolarsResult<Self> {
2118        if by_column.is_empty() {
2119            // If no columns selected, any order (including original order) is correct.
2120            return if let Some((offset, len)) = slice {
2121                Ok(self.slice(offset, len))
2122            } else {
2123                Ok(self.clone())
2124            };
2125        }
2126
2127        // Note that the by_column argument can also contain evaluated expressions from
2128        // polars-lazy that may not even be present in this dataframe. Therefore,
2129        // when we try to set the first column as sorted, we ignore the error, as the
2130        // expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i).
2131        let first_descending = sort_options.descending[0];
2132        let first_by_column = by_column[0].name().to_string();
2133
2134        let set_sorted = |df: &mut DataFrame| {
2135            // Mark the first sort column as sorted; if the column does not exist it
2136            // is ok, because we sorted by an expression not present in the dataframe
2137            let _ = df.apply(&first_by_column, |s| {
2138                let mut s = s.clone();
2139                if first_descending {
2140                    s.set_sorted_flag(IsSorted::Descending)
2141                } else {
2142                    s.set_sorted_flag(IsSorted::Ascending)
2143                }
2144                s
2145            });
2146        };
2147        if self.is_empty() {
2148            let mut out = self.clone();
2149            set_sorted(&mut out);
2150            return Ok(out);
2151        }
2152
2153        if let Some((0, k)) = slice {
2154            if k < self.len() {
2155                return self.bottom_k_impl(k, by_column, sort_options);
2156            }
2157        }
2158        // Check if the required column is already sorted; if so, we can exit early.
2159        // We only do this when there is a single column to sort by; for multiple columns
2160        // it would be complicated to determine.
2161        #[cfg(feature = "dtype-categorical")]
2162        let is_not_categorical_enum =
2163            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2164                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2165
2166        #[cfg(not(feature = "dtype-categorical"))]
2167        #[allow(non_upper_case_globals)]
2168        const is_not_categorical_enum: bool = true;
2169
2170        if by_column.len() == 1 && is_not_categorical_enum {
2171            let required_sorting = if sort_options.descending[0] {
2172                IsSorted::Descending
2173            } else {
2174                IsSorted::Ascending
2175            };
2176            // If the null count is 0 then nulls_last doesn't matter.
2177            // It is safe to get the value at the last position since the dataframe is not empty (handled above).
2178            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2179                && ((by_column[0].null_count() == 0)
2180                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2181                        == sort_options.nulls_last[0]);
2182
2183            if no_sorting_required {
2184                return if let Some((offset, len)) = slice {
2185                    Ok(self.slice(offset, len))
2186                } else {
2187                    Ok(self.clone())
2188                };
2189            }
2190        }
2191
2192        let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2193
2194        // a lot of indirection in both sorting and take
2195        let mut df = self.clone();
2196        let df = df.as_single_chunk_par();
2197        let mut take = match (by_column.len(), has_nested) {
2198            (1, false) => {
2199                let s = &by_column[0];
2200                let options = SortOptions {
2201                    descending: sort_options.descending[0],
2202                    nulls_last: sort_options.nulls_last[0],
2203                    multithreaded: sort_options.multithreaded,
2204                    maintain_order: sort_options.maintain_order,
2205                    limit: sort_options.limit,
2206                };
2207                // fast path for a frame with a single series
2208                // no need to compute the sort indices and then take by these indices
2209                // simply sort and return as frame
2210                if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2211                    let mut out = s.sort_with(options)?;
2212                    if let Some((offset, len)) = slice {
2213                        out = out.slice(offset, len);
2214                    }
2215                    return Ok(out.into_frame());
2216                }
2217                s.arg_sort(options)
2218            },
2219            _ => {
2220                if sort_options.nulls_last.iter().all(|&x| x)
2221                    || has_nested
2222                    || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2223                {
2224                    argsort_multiple_row_fmt(
2225                        &by_column,
2226                        sort_options.descending,
2227                        sort_options.nulls_last,
2228                        sort_options.multithreaded,
2229                    )?
2230                } else {
2231                    let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2232                    first
2233                        .as_materialized_series()
2234                        .arg_sort_multiple(&other, &sort_options)?
2235                }
2236            },
2237        };
2238
2239        if let Some((offset, len)) = slice {
2240            take = take.slice(offset, len);
2241        }
2242
2243        // SAFETY:
2244        // the created indices are in bounds
2245        let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2246        set_sorted(&mut df);
2247        Ok(df)
2248    }
2249
2250    /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2251    ///
2252    /// This dataframe does not necessarily have a specified schema and may be changed at any
2253    /// point. It is primarily used for debugging.
2254    pub fn _to_metadata(&self) -> DataFrame {
2255        let num_columns = self.columns.len();
2256
2257        let mut column_names =
2258            StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2259        let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2260        let mut sorted_asc_ca =
2261            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2262        let mut sorted_dsc_ca =
2263            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2264        let mut fast_explode_list_ca =
2265            BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2266        let mut materialized_at_ca =
2267            StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2268
2269        for col in &self.columns {
2270            let flags = col.get_flags();
2271
2272            let (repr, materialized_at) = match col {
2273                Column::Series(s) => ("series", s.materialized_at()),
2274                Column::Partitioned(_) => ("partitioned", None),
2275                Column::Scalar(_) => ("scalar", None),
2276            };
2277            let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2278            let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2279            let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2280
2281            column_names.append_value(col.name().clone());
2282            repr_ca.append_value(repr);
2283            sorted_asc_ca.append_value(sorted_asc);
2284            sorted_dsc_ca.append_value(sorted_dsc);
2285            fast_explode_list_ca.append_value(fast_explode_list);
2286            materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2287        }
2288
2289        unsafe {
2290            DataFrame::new_no_checks(
2291                self.width(),
2292                vec![
2293                    column_names.finish().into_column(),
2294                    repr_ca.finish().into_column(),
2295                    sorted_asc_ca.finish().into_column(),
2296                    sorted_dsc_ca.finish().into_column(),
2297                    fast_explode_list_ca.finish().into_column(),
2298                    materialized_at_ca.finish().into_column(),
2299                ],
2300            )
2301        }
2302    }
2303
2304    /// Return a sorted clone of this [`DataFrame`].
2305    ///
2306    /// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2307    /// # Example
2308    ///
2309    /// Sort by a single column with default options:
2310    /// ```
2311    /// # use polars_core::prelude::*;
2312    /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2313    ///     df.sort(["sepal_width"], Default::default())
2314    /// }
2315    /// ```
2316    /// Sort by a single column with specific order:
2317    /// ```
2318    /// # use polars_core::prelude::*;
2319    /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2320    ///     df.sort(
2321    ///         ["sepal_width"],
2322    ///         SortMultipleOptions::new()
2323    ///             .with_order_descending(descending)
2324    ///     )
2325    /// }
2326    /// ```
2327    /// Sort by multiple columns with specifying order for each column:
2328    /// ```
2329    /// # use polars_core::prelude::*;
2330    /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2331    ///     df.sort(
2332    ///         ["sepal_width", "sepal_length"],
2333    ///         SortMultipleOptions::new()
2334    ///             .with_order_descending_multi([false, true])
2335    ///     )
2336    /// }
2337    /// ```
2338    /// See [`SortMultipleOptions`] for more options.
2339    ///
2340    /// Also see [`DataFrame::sort_in_place`].
2341    pub fn sort(
2342        &self,
2343        by: impl IntoVec<PlSmallStr>,
2344        sort_options: SortMultipleOptions,
2345    ) -> PolarsResult<Self> {
2346        let mut df = self.clone();
2347        df.sort_in_place(by, sort_options)?;
2348        Ok(df)
2349    }
2350
2351    /// Replace a column with a [`Series`].
2352    ///
2353    /// # Example
2354    ///
2355    /// ```rust
2356    /// # use polars_core::prelude::*;
2357    /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2358    ///                         "Area (km²)" => [9_833_520, 9_596_961])?;
2359    /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2360    ///
2361    /// assert!(df.replace("Nation", s.clone()).is_err());
2362    /// assert!(df.replace("Country", s).is_ok());
2363    /// # Ok::<(), PolarsError>(())
2364    /// ```
2365    pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2366        self.apply(column, |_| new_col.into_series())
2367    }
2368
2369    /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2370    /// is that the `column` argument determines the name of the column, not the name
2371    /// of the `Series` passed to this method.
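    ///
    /// # Example
    ///
    /// An illustrative sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
    /// // The Series name "ignored" is replaced by "b" before the column is added.
    /// df.replace_or_add("b".into(), Series::new("ignored".into(), [4, 5, 6]))?;
    /// assert!(df.column("b").is_ok());
    /// # Ok::<(), PolarsError>(())
    /// ```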
2372    pub fn replace_or_add<S: IntoSeries>(
2373        &mut self,
2374        column: PlSmallStr,
2375        new_col: S,
2376    ) -> PolarsResult<&mut Self> {
2377        let mut new_col = new_col.into_series();
2378        new_col.rename(column);
2379        self.with_column(new_col)
2380    }
2381
2382    /// Replace column at index `idx` with a [`Series`].
2383    ///
2384    /// # Example
2385    ///
2386    /// ```ignore
2387    /// # use polars_core::prelude::*;
2388    /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2389    /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2390    /// let mut df = DataFrame::new(vec![s0, s1])?;
2391    ///
2392    /// // Add 32 to get lowercase ascii values
2393    /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2394    /// # Ok::<(), PolarsError>(())
2395    /// ```
2396    pub fn replace_column<C: IntoColumn>(
2397        &mut self,
2398        index: usize,
2399        new_column: C,
2400    ) -> PolarsResult<&mut Self> {
2401        polars_ensure!(
2402            index < self.width(),
2403            ShapeMismatch:
2404            "unable to replace at index {}, the DataFrame has only {} columns",
2405            index, self.width(),
2406        );
2407        let mut new_column = new_column.into_column();
2408        polars_ensure!(
2409            new_column.len() == self.height(),
2410            ShapeMismatch:
2411            "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2412            new_column.len(), self.height(),
2413        );
2414        let old_col = &mut self.columns[index];
2415        mem::swap(old_col, &mut new_column);
2416        self.clear_schema();
2417        Ok(self)
2418    }
2419
2420    /// Apply a closure to a column. This is the recommended way to do in place modification.
2421    ///
2422    /// # Example
2423    ///
2424    /// ```rust
2425    /// # use polars_core::prelude::*;
2426    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2427    /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2428    /// let mut df = DataFrame::new(vec![s0, s1])?;
2429    ///
2430    /// fn str_to_len(str_val: &Column) -> Column {
2431    ///     str_val.str()
2432    ///         .unwrap()
2433    ///         .into_iter()
2434    ///         .map(|opt_name: Option<&str>| {
2435    ///             opt_name.map(|name: &str| name.len() as u32)
2436    ///          })
2437    ///         .collect::<UInt32Chunked>()
2438    ///         .into_column()
2439    /// }
2440    ///
2441    /// // Replace the names column by the length of the names.
2442    /// df.apply("names", str_to_len);
2443    /// # Ok::<(), PolarsError>(())
2444    /// ```
2445    /// Results in:
2446    ///
2447    /// ```text
2448    /// +--------+-------+
2449    /// | foo    | names |
2450    /// | ---    | ---   |
2451    /// | str    | u32   |
2452    /// +========+=======+
2453    /// | "ham"  | 4     |
2454    /// +--------+-------+
2455    /// | "spam" | 6     |
2456    /// +--------+-------+
2457    /// | "egg"  | 3     |
2458    /// +--------+-------+
2459    /// ```
2460    pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2461    where
2462        F: FnOnce(&Column) -> C,
2463        C: IntoColumn,
2464    {
2465        let idx = self.check_name_to_idx(name)?;
2466        self.apply_at_idx(idx, f)?;
2467        Ok(self)
2468    }
2469
2470    /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2471    /// modification.
2472    ///
2473    /// # Example
2474    ///
2475    /// ```rust
2476    /// # use polars_core::prelude::*;
2477    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2478    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2479    /// let mut df = DataFrame::new(vec![s0, s1])?;
2480    ///
2481    /// // Add 32 to get lowercase ascii values
2482    /// df.apply_at_idx(1, |s| s + 32);
2483    /// # Ok::<(), PolarsError>(())
2484    /// ```
2485    /// Results in:
2486    ///
2487    /// ```text
2488    /// +--------+-------+
2489    /// | foo    | ascii |
2490    /// | ---    | ---   |
2491    /// | str    | i32   |
2492    /// +========+=======+
2493    /// | "ham"  | 102   |
2494    /// +--------+-------+
2495    /// | "spam" | 111   |
2496    /// +--------+-------+
2497    /// | "egg"  | 111   |
2498    /// +--------+-------+
2499    /// ```
2500    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2501    where
2502        F: FnOnce(&Column) -> C,
2503        C: IntoColumn,
2504    {
2505        let df_height = self.height();
2506        let width = self.width();
2507        let col = self.columns.get_mut(idx).ok_or_else(|| {
2508            polars_err!(
2509                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2510                idx, width
2511            )
2512        })?;
2513        let name = col.name().clone();
2514        let dtype_before = col.dtype().clone();
2515        let new_col = f(col).into_column();
2516        match new_col.len() {
2517            1 => {
2518                let new_col = new_col.new_from_index(0, df_height);
2519                let _ = mem::replace(col, new_col);
2520            },
2521            len if (len == df_height) => {
2522                let _ = mem::replace(col, new_col);
2523            },
2524            len => polars_bail!(
2525                ShapeMismatch:
2526                "resulting Series has length {} while the DataFrame has height {}",
2527                len, df_height
2528            ),
2529        }
2530
2531        // make sure the name remains the same after applying the closure
2532        unsafe {
2533            let col = self.columns.get_unchecked_mut(idx);
2534            col.rename(name);
2535
2536            if col.dtype() != &dtype_before {
2537                self.clear_schema();
2538            }
2539        }
2540        Ok(self)
2541    }
2542
2543    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2544    /// modification.
2545    ///
2546    /// # Example
2547    ///
2548    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2549    ///
2550    /// ```rust
2551    /// # use polars_core::prelude::*;
2552    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2553    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2554    /// let mut df = DataFrame::new(vec![s0, s1])?;
2555    ///
2556    /// let idx = vec![0, 1, 4];
2557    ///
2558    /// df.try_apply("foo", |c| {
2559    ///     c.str()?
2560    ///     .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2561    /// });
2562    /// # Ok::<(), PolarsError>(())
2563    /// ```
2564    /// Results in:
2565    ///
2566    /// ```text
2567    /// +---------------------+--------+
2568    /// | foo                 | values |
2569    /// | ---                 | ---    |
2570    /// | str                 | i32    |
2571    /// +=====================+========+
2572    /// | "ham-is-modified"   | 1      |
2573    /// +---------------------+--------+
2574    /// | "spam-is-modified"  | 2      |
2575    /// +---------------------+--------+
2576    /// | "egg"               | 3      |
2577    /// +---------------------+--------+
2578    /// | "bacon"             | 4      |
2579    /// +---------------------+--------+
2580    /// | "quack-is-modified" | 5      |
2581    /// +---------------------+--------+
2582    /// ```
2583    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2584    where
2585        F: FnOnce(&Column) -> PolarsResult<C>,
2586        C: IntoColumn,
2587    {
2588        let width = self.width();
2589        let col = self.columns.get_mut(idx).ok_or_else(|| {
2590            polars_err!(
2591                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2592                idx, width
2593            )
2594        })?;
2595        let name = col.name().clone();
2596
2597        let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2598
2599        // make sure the name remains the same after applying the closure
2600        unsafe {
2601            let col = self.columns.get_unchecked_mut(idx);
2602            col.rename(name);
2603        }
2604        Ok(self)
2605    }
2606
2607    /// Apply a closure that may fail to a column. This is the recommended way to do in place
2608    /// modification.
2609    ///
2610    /// # Example
2611    ///
2612    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2613    ///
2614    /// ```rust
2615    /// # use polars_core::prelude::*;
2616    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2617    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2618    /// let mut df = DataFrame::new(vec![s0, s1])?;
2619    ///
2620    /// // create a mask
2621    /// let values = df.column("values")?.as_materialized_series();
2622    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2623    ///
2624    /// df.try_apply("foo", |c| {
2625    ///     c.str()?
2626    ///     .set(&mask, Some("not_within_bounds"))
2627    /// });
2628    /// # Ok::<(), PolarsError>(())
2629    /// ```
2630    /// Results in:
2631    ///
2632    /// ```text
2633    /// +---------------------+--------+
2634    /// | foo                 | values |
2635    /// | ---                 | ---    |
2636    /// | str                 | i32    |
2637    /// +=====================+========+
2638    /// | "not_within_bounds" | 1      |
2639    /// +---------------------+--------+
2640    /// | "spam"              | 2      |
2641    /// +---------------------+--------+
2642    /// | "egg"               | 3      |
2643    /// +---------------------+--------+
2644    /// | "bacon"             | 4      |
2645    /// +---------------------+--------+
2646    /// | "not_within_bounds" | 5      |
2647    /// +---------------------+--------+
2648    /// ```
2649    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2650    where
2651        F: FnOnce(&Series) -> PolarsResult<C>,
2652        C: IntoColumn,
2653    {
2654        let idx = self.try_get_column_index(column)?;
2655        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2656    }
2657
2658    /// Slice the [`DataFrame`] along the rows.
2659    ///
2660    /// # Example
2661    ///
2662    /// ```rust
2663    /// # use polars_core::prelude::*;
2664    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2665    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
2666    /// let sl: DataFrame = df.slice(2, 3);
2667    ///
2668    /// assert_eq!(sl.shape(), (3, 2));
2669    /// println!("{}", sl);
2670    /// # Ok::<(), PolarsError>(())
2671    /// ```
2672    /// Output:
2673    /// ```text
2674    /// shape: (3, 2)
2675    /// +-------+-------+
2676    /// | Fruit | Color |
2677    /// | ---   | ---   |
2678    /// | str   | str   |
2679    /// +=======+=======+
2680    /// | Grape | White |
2681    /// +-------+-------+
2682    /// | Fig   | White |
2683    /// +-------+-------+
2684    /// | Fig   | Red   |
2685    /// +-------+-------+
2686    /// ```
2687    #[must_use]
2688    pub fn slice(&self, offset: i64, length: usize) -> Self {
2689        if offset == 0 && length == self.height() {
2690            return self.clone();
2691        }
2692        if length == 0 {
2693            return self.clear();
2694        }
2695        let col = self
2696            .columns
2697            .iter()
2698            .map(|s| s.slice(offset, length))
2699            .collect::<Vec<_>>();
2700
2701        let height = if let Some(fst) = col.first() {
2702            fst.len()
2703        } else {
2704            let (_, length) = slice_offsets(offset, length, self.height());
2705            length
2706        };
2707
2708        unsafe { DataFrame::new_no_checks(height, col) }
2709    }
2710
2711    /// Split [`DataFrame`] at the given `offset`.
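    ///
    /// # Example
    ///
    /// A small sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3, 4])?;
    /// let (left, right) = df.split_at(1);
    /// assert_eq!(left.height(), 1);
    /// assert_eq!(right.height(), 3);
    /// # Ok::<(), PolarsError>(())
    /// ```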
2712    pub fn split_at(&self, offset: i64) -> (Self, Self) {
2713        let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2714
2715        let (idx, _) = slice_offsets(offset, 0, self.height());
2716
2717        let a = unsafe { DataFrame::new_no_checks(idx, a) };
2718        let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2719        (a, b)
2720    }
2721
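    /// Create an empty copy of this [`DataFrame`]: the same column names and dtypes, but zero rows.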
2722    pub fn clear(&self) -> Self {
2723        let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2724        unsafe { DataFrame::new_no_checks(0, col) }
2725    }
2726
2727    #[must_use]
2728    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2729        if offset == 0 && length == self.height() {
2730            return self.clone();
2731        }
2732        let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2733        unsafe { DataFrame::new_no_checks(length, columns) }
2734    }
2735
2736    #[must_use]
2737    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2738        if offset == 0 && length == self.height() {
2739            return self.clone();
2740        }
2741        // @scalar-opt
2742        let columns = self._apply_columns(&|s| {
2743            let mut out = s.slice(offset, length);
2744            out.shrink_to_fit();
2745            out
2746        });
2747        unsafe { DataFrame::new_no_checks(length, columns) }
2748    }
2749
2750    /// Get the head of the [`DataFrame`].
2751    ///
2752    /// # Example
2753    ///
2754    /// ```rust
2755    /// # use polars_core::prelude::*;
2756    /// let countries: DataFrame =
2757    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2758    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2759    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2760    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2761    /// assert_eq!(countries.shape(), (5, 4));
2762    ///
2763    /// println!("{}", countries.head(Some(3)));
2764    /// # Ok::<(), PolarsError>(())
2765    /// ```
2766    ///
2767    /// Output:
2768    ///
2769    /// ```text
2770    /// shape: (3, 4)
2771    /// +--------------------+---------------+---------------+------------+
2772    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
2773    /// | ---                | ---           | ---           | ---        |
2774    /// | i32                | str           | str           | str        |
2775    /// +====================+===============+===============+============+
2776    /// | 1                  | North America | United States | Washington |
2777    /// +--------------------+---------------+---------------+------------+
2778    /// | 2                  | Asia          | China         | Beijing    |
2779    /// +--------------------+---------------+---------------+------------+
2780    /// | 3                  | Asia          | Japan         | Tokyo      |
2781    /// +--------------------+---------------+---------------+------------+
2782    /// ```
2783    #[must_use]
2784    pub fn head(&self, length: Option<usize>) -> Self {
2785        let col = self
2786            .columns
2787            .iter()
2788            .map(|c| c.head(length))
2789            .collect::<Vec<_>>();
2790
2791        let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2792        let height = usize::min(height, self.height());
2793        unsafe { DataFrame::new_no_checks(height, col) }
2794    }
2795
2796    /// Get the tail of the [`DataFrame`].
2797    ///
2798    /// # Example
2799    ///
2800    /// ```rust
2801    /// # use polars_core::prelude::*;
2802    /// let countries: DataFrame =
2803    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
2804    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2805    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2806    /// assert_eq!(countries.shape(), (5, 3));
2807    ///
2808    /// println!("{}", countries.tail(Some(2)));
2809    /// # Ok::<(), PolarsError>(())
2810    /// ```
2811    ///
2812    /// Output:
2813    ///
2814    /// ```text
2815    /// shape: (2, 3)
2816    /// +-------------+--------------------+---------+
2817    /// | Rank (2021) | Apple Price (€/kg) | Country |
2818    /// | ---         | ---                | ---     |
2819    /// | i32         | f64                | str     |
2820    /// +=============+====================+=========+
2821    /// | 108         | 0.65               | Syria   |
2822    /// +-------------+--------------------+---------+
2823    /// | 109         | 0.52               | Turkey  |
2824    /// +-------------+--------------------+---------+
2825    /// ```
2826    #[must_use]
2827    pub fn tail(&self, length: Option<usize>) -> Self {
2828        let col = self
2829            .columns
2830            .iter()
2831            .map(|c| c.tail(length))
2832            .collect::<Vec<_>>();
2833
2834        let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2835        let height = usize::min(height, self.height());
2836        unsafe { DataFrame::new_no_checks(height, col) }
2837    }
2838
2839    /// Iterator over the chunks in this [`DataFrame`] as Arrow RecordBatches.
2840    ///
2841    /// # Panics
2842    ///
2843    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2844    ///
2845    /// This responsibility is left to the caller as we don't want to take mutable references here,
2846    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2847    /// as well.
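    ///
    /// # Example
    ///
    /// A minimal sketch, using [`DataFrame::as_single_chunk_par`] to rechunk first:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &mut DataFrame) -> usize {
    ///     // Ensure a single chunk per column before iterating.
    ///     df.as_single_chunk_par();
    ///     // Count how many record batches the frame yields.
    ///     df.iter_chunks(CompatLevel::newest(), false).count()
    /// }
    /// ```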
2848    pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2849        debug_assert!(!self.should_rechunk(), "expected equal chunks");
2850        // If any of the columns is a binview and we must convert it for this `compat_level`, we allow
2851        // parallelism, as we must allocate and copy into arrow strings/binaries.
2852        let must_convert = compat_level.0 == 0;
2853        let parallel = parallel
2854            && must_convert
2855            && self.columns.len() > 1
2856            && self
2857                .columns
2858                .iter()
2859                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2860
2861        RecordBatchIter {
2862            columns: &self.columns,
2863            schema: Arc::new(
2864                self.columns
2865                    .iter()
2866                    .map(|c| c.field().to_arrow(compat_level))
2867                    .collect(),
2868            ),
2869            idx: 0,
2870            n_chunks: self.first_col_n_chunks(),
2871            compat_level,
2872            parallel,
2873        }
2874    }
2875
2876    /// Iterator over the chunks in this [`DataFrame`] as Arrow RecordBatches of physical values.
2877    ///
2878    /// # Panics
2879    ///
2880    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2881    ///
2882    /// This responsibility is left to the caller as we don't want to take mutable references here,
2883    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2884    /// as well.
2885    pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2886        debug_assert!(!self.should_rechunk());
2887        PhysRecordBatchIter {
2888            schema: Arc::new(
2889                self.get_columns()
2890                    .iter()
2891                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
2892                    .collect(),
2893            ),
2894            arr_iters: self
2895                .materialized_column_iter()
2896                .map(|s| s.chunks().iter())
2897                .collect(),
2898        }
2899    }
2900
2901    /// Get a [`DataFrame`] in which every column is reversed, i.e. the rows are in reversed order.
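    ///
    /// # Example
    ///
    /// A small sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// let expected = df!("a" => [3, 2, 1])?;
    /// assert!(df.reverse().equals(&expected));
    /// # Ok::<(), PolarsError>(())
    /// ```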
2902    #[must_use]
2903    pub fn reverse(&self) -> Self {
2904        let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2905        unsafe { DataFrame::new_no_checks(self.height(), col) }
2906    }
2907
2908    /// Shift the values by a given period and fill the parts that will be empty due to this operation
2909    /// with `None` values.
2910    ///
2911    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
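    ///
    /// # Example
    ///
    /// A small sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// // Shifting by 1 leaves a null in the first row.
    /// let shifted = df.shift(1);
    /// assert_eq!(shifted.column("a")?.null_count(), 1);
    /// # Ok::<(), PolarsError>(())
    /// ```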
2912    #[must_use]
2913    pub fn shift(&self, periods: i64) -> Self {
2914        let col = self._apply_columns_par(&|s| s.shift(periods));
2915        unsafe { DataFrame::new_no_checks(self.height(), col) }
2916    }
2917
2918    /// Replace None values with one of the following strategies:
2919    /// * Forward fill (replace None with the previous value)
2920    /// * Backward fill (replace None with the next value)
2921    /// * Mean fill (replace None with the mean of the whole array)
2922    /// * Min fill (replace None with the minimum of the whole array)
2923    /// * Max fill (replace None with the maximum of the whole array)
2924    ///
2925    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
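    ///
    /// # Example
    ///
    /// A small sketch using the mean-fill strategy (`FillNullStrategy::Mean`):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [Some(1.0), None, Some(3.0)])?;
    /// let filled = df.fill_null(FillNullStrategy::Mean)?;
    /// assert_eq!(filled.column("a")?.null_count(), 0);
    /// # Ok::<(), PolarsError>(())
    /// ```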
2926    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2927        let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2928
2929        Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2930    }
2931
2932    /// Pipe the [`DataFrame`] by value through a function or closure, enabling method-style chaining.
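    ///
    /// # Example
    ///
    /// A minimal sketch with an illustrative closure:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1, 2, 3])?;
    /// let n_rows = df.pipe(|df| Ok(df.height()))?;
    /// assert_eq!(n_rows, 3);
    /// # Ok::<(), PolarsError>(())
    /// ```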
2933    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2934    where
2935        F: Fn(DataFrame) -> PolarsResult<B>,
2936    {
2937        f(self)
2938    }
2939
2940    /// Pipe the [`DataFrame`] through a function or closure that takes a mutable reference.
2941    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2942    where
2943        F: Fn(&mut DataFrame) -> PolarsResult<B>,
2944    {
2945        f(self)
2946    }
2947
2948    /// Pipe the [`DataFrame`] by value through a function or closure, forwarding additional arguments.
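    ///
    /// # Example
    ///
    /// A minimal sketch with an illustrative closure and argument:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1, 2, 3])?;
    /// let top = df.pipe_with_args(|df, n| Ok(df.head(Some(n))), 2)?;
    /// assert_eq!(top.height(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```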
2949    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2950    where
2951        F: Fn(DataFrame, Args) -> PolarsResult<B>,
2952    {
2953        f(self, args)
2954    }
2955
2956    /// Drop duplicate rows from a [`DataFrame`].
2957    /// *This fails when the [`DataFrame`] contains a column of type `List`.*
2958    ///
2959    /// Stable means that the original row order is maintained. This has a higher cost than an unstable distinct.
2960    ///
2961    /// # Example
2962    ///
2963    /// ```no_run
2964    /// # use polars_core::prelude::*;
2965    /// let df = df! {
2966    ///               "flt" => [1., 1., 2., 2., 3., 3.],
2967    ///               "int" => [1, 1, 2, 2, 3, 3, ],
2968    ///               "str" => ["a", "a", "b", "b", "c", "c"]
2969    ///           }?;
2970    ///
2971    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2972    /// # Ok::<(), PolarsError>(())
2973    /// ```
2974    /// Returns
2975    ///
2976    /// ```text
2977    /// +-----+-----+-----+
2978    /// | flt | int | str |
2979    /// | --- | --- | --- |
2980    /// | f64 | i32 | str |
2981    /// +=====+=====+=====+
2982    /// | 1   | 1   | "a" |
2983    /// +-----+-----+-----+
2984    /// | 2   | 2   | "b" |
2985    /// +-----+-----+-----+
2986    /// | 3   | 3   | "c" |
2987    /// +-----+-----+-----+
2988    /// ```
2989    #[cfg(feature = "algorithm_group_by")]
2990    pub fn unique_stable(
2991        &self,
2992        subset: Option<&[String]>,
2993        keep: UniqueKeepStrategy,
2994        slice: Option<(i64, usize)>,
2995    ) -> PolarsResult<DataFrame> {
2996        self.unique_impl(
2997            true,
2998            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2999            keep,
3000            slice,
3001        )
3002    }
3003
3004    /// Unstable distinct. See [`DataFrame::unique_stable`].
3005    #[cfg(feature = "algorithm_group_by")]
3006    pub fn unique(
3007        &self,
3008        subset: Option<&[String]>,
3009        keep: UniqueKeepStrategy,
3010        slice: Option<(i64, usize)>,
3011    ) -> PolarsResult<DataFrame> {
3012        self.unique_impl(
3013            false,
3014            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3015            keep,
3016            slice,
3017        )
3018    }
3019
3020    #[cfg(feature = "algorithm_group_by")]
3021    pub fn unique_impl(
3022        &self,
3023        maintain_order: bool,
3024        subset: Option<Vec<PlSmallStr>>,
3025        keep: UniqueKeepStrategy,
3026        slice: Option<(i64, usize)>,
3027    ) -> PolarsResult<Self> {
3028        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3029        let mut df = self.clone();
3030        // gather (take) on multiple chunks performs poorly, so collapse to a single chunk first
3031        df.as_single_chunk_par();
3032
3033        let columns = match (keep, maintain_order) {
3034            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3035                let gb = df.group_by_stable(names)?;
3036                let groups = gb.get_groups();
3037                let (offset, len) = slice.unwrap_or((0, groups.len()));
3038                let groups = groups.slice(offset, len);
3039                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3040            },
3041            (UniqueKeepStrategy::Last, true) => {
3042                // we need to keep the order of the *last* occurrence of each group, but the stable
3043                // groups are ordered by first occurrence, so gather the last index per group and sort those
3044                let gb = df.group_by_stable(names)?;
3045                let groups = gb.get_groups();
3046
3047                let last_idx: NoNull<IdxCa> = groups
3048                    .iter()
3049                    .map(|g| match g {
3050                        GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3051                        GroupsIndicator::Slice([first, len]) => first + len - 1,
3052                    })
3053                    .collect();
3054
3055                let mut last_idx = last_idx.into_inner().sort(false);
3056
3057                if let Some((offset, len)) = slice {
3058                    last_idx = last_idx.slice(offset, len);
3059                }
3060
3061                let last_idx = NoNull::new(last_idx);
3062                let out = unsafe { df.take_unchecked(&last_idx) };
3063                return Ok(out);
3064            },
3065            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3066                let gb = df.group_by(names)?;
3067                let groups = gb.get_groups();
3068                let (offset, len) = slice.unwrap_or((0, groups.len()));
3069                let groups = groups.slice(offset, len);
3070                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3071            },
3072            (UniqueKeepStrategy::Last, false) => {
3073                let gb = df.group_by(names)?;
3074                let groups = gb.get_groups();
3075                let (offset, len) = slice.unwrap_or((0, groups.len()));
3076                let groups = groups.slice(offset, len);
3077                df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3078            },
3079            (UniqueKeepStrategy::None, _) => {
3080                let df_part = df.select(names)?;
3081                let mask = df_part.is_unique()?;
3082                let mut filtered = df.filter(&mask)?;
3083
3084                if let Some((offset, len)) = slice {
3085                    filtered = filtered.slice(offset, len);
3086                }
3087                return Ok(filtered);
3088            },
3089        };
3090        let height = Self::infer_height(&columns);
3091        Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3092    }
3093
3094    /// Get a mask of all the unique rows in the [`DataFrame`].
3095    ///
3096    /// # Example
3097    ///
3098    /// ```no_run
3099    /// # use polars_core::prelude::*;
3100    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3101    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
3102    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3103    ///
3104    /// assert!(ca.all());
3105    /// # Ok::<(), PolarsError>(())
3106    /// ```
3107    #[cfg(feature = "algorithm_group_by")]
3108    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3109        let gb = self.group_by(self.get_column_names_owned())?;
3110        let groups = gb.get_groups();
3111        Ok(is_unique_helper(
3112            groups,
3113            self.height() as IdxSize,
3114            true,
3115            false,
3116        ))
3117    }
3118
3119    /// Get a mask of all the duplicated rows in the [`DataFrame`].
3120    ///
3121    /// # Example
3122    ///
3123    /// ```no_run
3124    /// # use polars_core::prelude::*;
3125    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3126    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
3127    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3128    ///
3129    /// assert!(!ca.all());
3130    /// # Ok::<(), PolarsError>(())
3131    /// ```
3132    #[cfg(feature = "algorithm_group_by")]
3133    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3134        let gb = self.group_by(self.get_column_names_owned())?;
3135        let groups = gb.get_groups();
3136        Ok(is_unique_helper(
3137            groups,
3138            self.height() as IdxSize,
3139            false,
3140            true,
3141        ))
3142    }
3143
3144    /// Create a new [`DataFrame`] that shows the null counts per column.
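    ///
    /// # Example
    ///
    /// A minimal sketch (illustrative values):
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [Some(1), None, Some(3)])?;
    /// let counts = df.null_count();
    /// // a single row holding the null count of every column
    /// assert_eq!(counts.shape(), (1, 1));
    /// # Ok::<(), PolarsError>(())
    /// ```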
3145    #[must_use]
3146    pub fn null_count(&self) -> Self {
3147        let cols = self
3148            .columns
3149            .iter()
3150            .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3151            .collect();
3152        unsafe { Self::new_no_checks(1, cols) }
3153    }
3154
3155    /// Hash and combine the row values
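    ///
    /// A minimal sketch using the default hasher (illustrative values):
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [1, 2, 3])?;
    /// let hashes = df.hash_rows(None)?;
    /// assert_eq!(hashes.len(), df.height());
    /// # Ok::<(), PolarsError>(())
    /// ```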
3156    #[cfg(feature = "row_hash")]
3157    pub fn hash_rows(
3158        &mut self,
3159        hasher_builder: Option<PlSeedableRandomStateQuality>,
3160    ) -> PolarsResult<UInt64Chunked> {
3161        let dfs = split_df(self, POOL.current_num_threads(), false);
3162        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3163
3164        let mut iter = cas.into_iter();
3165        let mut acc_ca = iter.next().unwrap();
3166        for ca in iter {
3167            acc_ca.append(&ca)?;
3168        }
3169        Ok(acc_ca.rechunk().into_owned())
3170    }
3171
3172    /// Get the supertype of the columns in this DataFrame
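    ///
    /// A minimal sketch (illustrative values):
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df = df!("ints" => [1, 2], "floats" => [1.0, 2.0])?;
    /// let supertype = df.get_supertype().unwrap()?;
    /// assert_eq!(supertype, DataType::Float64);
    /// # Ok::<(), PolarsError>(())
    /// ```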
3173    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3174        self.columns
3175            .iter()
3176            .map(|s| Ok(s.dtype().clone()))
3177            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3178    }
3179
3180    /// Take by index values given by the slice `idx`.
3181    /// # Warning
3182    /// Be careful with allowing threads when calling this in a large hot loop:
3183    /// every thread split may land on the rayon stack and lead to a stack overflow (SO).
3184    #[doc(hidden)]
3185    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3186        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3187    }
3188
3189    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3190    /// if the index values in `idx` are sorted. This will maintain the sorted flags.
3191    ///
3192    /// # Warning
3193    /// Be careful with allowing threads when calling this in a large hot loop:
3194    /// every thread split may land on the rayon stack and lead to a stack overflow (SO).
3195    #[doc(hidden)]
3196    pub unsafe fn _take_unchecked_slice_sorted(
3197        &self,
3198        idx: &[IdxSize],
3199        allow_threads: bool,
3200        sorted: IsSorted,
3201    ) -> Self {
3202        #[cfg(debug_assertions)]
3203        {
3204            if idx.len() > 2 {
3205                match sorted {
3206                    IsSorted::Ascending => {
3207                        assert!(idx[0] <= idx[idx.len() - 1]);
3208                    },
3209                    IsSorted::Descending => {
3210                        assert!(idx[0] >= idx[idx.len() - 1]);
3211                    },
3212                    _ => {},
3213                }
3214            }
3215        }
3216        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3217        ca.set_sorted_flag(sorted);
3218        self.take_unchecked_impl(&ca, allow_threads)
3219    }
3220
3221    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3222    #[doc(hidden)]
3223    pub fn _partition_by_impl(
3224        &self,
3225        cols: &[PlSmallStr],
3226        stable: bool,
3227        include_key: bool,
3228        parallel: bool,
3229    ) -> PolarsResult<Vec<DataFrame>> {
3230        let selected_keys = self.select_columns(cols.iter().cloned())?;
3231        let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3232        let groups = groups.take_groups();
3233
3234        // drop key columns prior to calculation if requested
3235        let df = if include_key {
3236            self.clone()
3237        } else {
3238            self.drop_many(cols.iter().cloned())
3239        };
3240
3241        if parallel {
3242            // parallelize over the groups, but do not allow threads inside the take itself:
3243            // take already has a lot of internal parallelization and nesting it may easily cause a stack overflow
3244            POOL.install(|| {
3245                match groups.as_ref() {
3246                    GroupsType::Idx(idx) => {
3247                        // Rechunk as the gather may rechunk for every group #17562.
3248                        let mut df = df.clone();
3249                        df.as_single_chunk_par();
3250                        Ok(idx
3251                            .into_par_iter()
3252                            .map(|(_, group)| {
3253                                // groups are in bounds
3254                                unsafe {
3255                                    df._take_unchecked_slice_sorted(
3256                                        group,
3257                                        false,
3258                                        IsSorted::Ascending,
3259                                    )
3260                                }
3261                            })
3262                            .collect())
3263                    },
3264                    GroupsType::Slice { groups, .. } => Ok(groups
3265                        .into_par_iter()
3266                        .map(|[first, len]| df.slice(*first as i64, *len as usize))
3267                        .collect()),
3268                }
3269            })
3270        } else {
3271            match groups.as_ref() {
3272                GroupsType::Idx(idx) => {
3273                    // Rechunk as the gather may rechunk for every group #17562.
3274                    let mut df = df;
3275                    df.as_single_chunk();
3276                    Ok(idx
3277                        .into_iter()
3278                        .map(|(_, group)| {
3279                            // groups are in bounds
3280                            unsafe {
3281                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3282                            }
3283                        })
3284                        .collect())
3285                },
3286                GroupsType::Slice { groups, .. } => Ok(groups
3287                    .iter()
3288                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
3289                    .collect()),
3290            }
3291        }
3292    }
3293
3294    /// Split into multiple DataFrames partitioned by groups.
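    ///
    /// # Example
    ///
    /// A minimal sketch (illustrative values):
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df = df!(
    ///     "group" => ["a", "a", "b"],
    ///     "value" => [1, 2, 3]
    /// )?;
    /// let parts: Vec<DataFrame> = df.partition_by(["group"], true)?;
    /// assert_eq!(parts.len(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```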
3295    #[cfg(feature = "partition_by")]
3296    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3297    where
3298        I: IntoIterator<Item = S>,
3299        S: Into<PlSmallStr>,
3300    {
3301        let cols = cols
3302            .into_iter()
3303            .map(Into::into)
3304            .collect::<Vec<PlSmallStr>>();
3305        self._partition_by_impl(cols.as_slice(), false, include_key, true)
3306    }
3307
3308    /// Split into multiple DataFrames partitioned by groups.
3309    /// The order of the groups is maintained.
3310    #[cfg(feature = "partition_by")]
3311    pub fn partition_by_stable<I, S>(
3312        &self,
3313        cols: I,
3314        include_key: bool,
3315    ) -> PolarsResult<Vec<DataFrame>>
3316    where
3317        I: IntoIterator<Item = S>,
3318        S: Into<PlSmallStr>,
3319    {
3320        let cols = cols
3321            .into_iter()
3322            .map(Into::into)
3323            .collect::<Vec<PlSmallStr>>();
3324        self._partition_by_impl(cols.as_slice(), true, include_key, true)
3325    }
3326
3327    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3328    /// inserted as columns.
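    ///
    /// A minimal sketch, assuming `df` already contains a `Struct` column named "coords":
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// # fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
    /// // the struct fields of "coords" replace the struct column in the output
    /// df.unnest(vec![PlSmallStr::from_static("coords")])
    /// # }
    /// ```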
3329    #[cfg(feature = "dtype-struct")]
3330    pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3331        let cols = cols.into_vec();
3332        self.unnest_impl(cols.into_iter().collect())
3333    }
3334
3335    #[cfg(feature = "dtype-struct")]
3336    fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3337        let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3338        let mut count = 0;
3339        for s in &self.columns {
3340            if cols.contains(s.name()) {
3341                let ca = s.struct_()?.clone();
3342                new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3343                count += 1;
3344            } else {
3345                new_cols.push(s.clone())
3346            }
3347        }
3348        if count != cols.len() {
3349            // one or more columns not found
3350            // the code below will return an error with the missing name
3351            let schema = self.schema();
3352            for col in cols {
3353                let _ = schema
3354                    .get(col.as_str())
3355                    .ok_or_else(|| polars_err!(col_not_found = col))?;
3356            }
3357        }
3358        DataFrame::new(new_cols)
3359    }
3360
3361    pub(crate) fn infer_height(cols: &[Column]) -> usize {
3362        cols.first().map_or(0, Column::len)
3363    }
3364
3365    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3366        // @Optimize: this does a lot of unnecessary allocations. We should probably have an
3367        // `append_chunk` or something similar. It is just quite difficult to make that safe.
3368        let df = DataFrame::from(rb);
3369        polars_ensure!(
3370            self.schema() == df.schema(),
3371            SchemaMismatch: "cannot append record batch with different schema\n\ngot: {:?}\nexpected: {:?}",
3372            df.schema(), self.schema(),
3373        );
3374        self.vstack_mut_owned_unchecked(df);
3375        Ok(())
3376    }
3377}
3378
3379pub struct RecordBatchIter<'a> {
3380    columns: &'a Vec<Column>,
3381    schema: ArrowSchemaRef,
3382    idx: usize,
3383    n_chunks: usize,
3384    compat_level: CompatLevel,
3385    parallel: bool,
3386}
3387
3388impl Iterator for RecordBatchIter<'_> {
3389    type Item = RecordBatch;
3390
3391    fn next(&mut self) -> Option<Self::Item> {
3392        if self.idx >= self.n_chunks {
3393            return None;
3394        }
3395
3396        // Create a batch of the columns with the same chunk no.
3397        let batch_cols: Vec<ArrayRef> = if self.parallel {
3398            let iter = self
3399                .columns
3400                .par_iter()
3401                .map(Column::as_materialized_series)
3402                .map(|s| s.to_arrow(self.idx, self.compat_level));
3403            POOL.install(|| iter.collect())
3404        } else {
3405            self.columns
3406                .iter()
3407                .map(Column::as_materialized_series)
3408                .map(|s| s.to_arrow(self.idx, self.compat_level))
3409                .collect()
3410        };
3411        self.idx += 1;
3412
3413        let length = batch_cols.first().map_or(0, |arr| arr.len());
3414        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3415    }
3416
3417    fn size_hint(&self) -> (usize, Option<usize>) {
3418        let n = self.n_chunks - self.idx;
3419        (n, Some(n))
3420    }
3421}
3422
3423pub struct PhysRecordBatchIter<'a> {
3424    schema: ArrowSchemaRef,
3425    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3426}
3427
3428impl Iterator for PhysRecordBatchIter<'_> {
3429    type Item = RecordBatch;
3430
3431    fn next(&mut self) -> Option<Self::Item> {
3432        let arrs = self
3433            .arr_iters
3434            .iter_mut()
3435            .map(|phys_iter| phys_iter.next().cloned())
3436            .collect::<Option<Vec<_>>>()?;
3437
3438        let length = arrs.first().map_or(0, |arr| arr.len());
3439        Some(RecordBatch::new(length, self.schema.clone(), arrs))
3440    }
3441
3442    fn size_hint(&self) -> (usize, Option<usize>) {
3443        if let Some(iter) = self.arr_iters.first() {
3444            iter.size_hint()
3445        } else {
3446            (0, None)
3447        }
3448    }
3449}
3450
3451impl Default for DataFrame {
3452    fn default() -> Self {
3453        DataFrame::empty()
3454    }
3455}
3456
3457impl From<DataFrame> for Vec<Column> {
3458    fn from(df: DataFrame) -> Self {
3459        df.columns
3460    }
3461}
3462
3463// utility to test if we can vstack/extend the columns
3464fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3465    polars_ensure!(
3466        left.name() == right.name(),
3467        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3468        left.name(), right.name(),
3469    );
3470    Ok(())
3471}
3472
3473#[cfg(test)]
3474mod test {
3475    use super::*;
3476
3477    fn create_frame() -> DataFrame {
3478        let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3479        let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3480        DataFrame::new(vec![s0, s1]).unwrap()
3481    }
3482
3483    #[test]
3484    #[cfg_attr(miri, ignore)]
3485    fn test_recordbatch_iterator() {
3486        let df = df!(
3487            "foo" => [1, 2, 3, 4, 5]
3488        )
3489        .unwrap();
3490        let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3491        assert_eq!(5, iter.next().unwrap().len());
3492        assert!(iter.next().is_none());
3493    }
3494
3495    #[test]
3496    #[cfg_attr(miri, ignore)]
3497    fn test_select() {
3498        let df = create_frame();
3499        assert_eq!(
3500            df.column("days")
3501                .unwrap()
3502                .as_series()
3503                .unwrap()
3504                .equal(1)
3505                .unwrap()
3506                .sum(),
3507            Some(1)
3508        );
3509    }
3510
3511    #[test]
3512    #[cfg_attr(miri, ignore)]
3513    fn test_filter_broadcast_on_string_col() {
3514        let col_name = "some_col";
3515        let v = vec!["test".to_string()];
3516        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3517        let mut df = DataFrame::new(vec![s0]).unwrap();
3518
3519        df = df
3520            .filter(
3521                &df.column(col_name)
3522                    .unwrap()
3523                    .as_materialized_series()
3524                    .equal("")
3525                    .unwrap(),
3526            )
3527            .unwrap();
3528        assert_eq!(
3529            df.column(col_name)
3530                .unwrap()
3531                .as_materialized_series()
3532                .n_chunks(),
3533            1
3534        );
3535    }
3536
3537    #[test]
3538    #[cfg_attr(miri, ignore)]
3539    fn test_filter_broadcast_on_list_col() {
3540        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3541        let ll: ListChunked = [&s1].iter().copied().collect();
3542
3543        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3544        let new = ll.filter(&mask).unwrap();
3545
3546        assert_eq!(new.chunks.len(), 1);
3547        assert_eq!(new.len(), 0);
3548    }
3549
3550    #[test]
3551    fn slice() {
3552        let df = create_frame();
3553        let sliced_df = df.slice(0, 2);
3554        assert_eq!(sliced_df.shape(), (2, 2));
3555    }
3556
3557    #[test]
3558    fn rechunk_false() {
3559        let df = create_frame();
3560        assert!(!df.should_rechunk())
3561    }
3562
3563    #[test]
3564    fn rechunk_true() -> PolarsResult<()> {
3565        let mut base = df!(
3566            "a" => [1, 2, 3],
3567            "b" => [1, 2, 3]
3568        )?;
3569
3570        // Create a series with multiple chunks
3571        let mut s = Series::new("foo".into(), 0..2);
3572        let s2 = Series::new("bar".into(), 0..1);
3573        s.append(&s2)?;
3574
3575        // Append series to frame
3576        let out = base.with_column(s)?;
3577
3578        // Now we should rechunk
3579        assert!(out.should_rechunk());
3580        Ok(())
3581    }
3582
3583    #[test]
3584    fn test_duplicate_column() {
3585        let mut df = df! {
3586            "foo" => [1, 2, 3]
3587        }
3588        .unwrap();
3589        // check if column is replaced
3590        assert!(
3591            df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3592                .is_ok()
3593        );
3594        assert!(
3595            df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3596                .is_ok()
3597        );
3598        assert!(df.column("bar").is_ok())
3599    }
3600
3601    #[test]
3602    #[cfg_attr(miri, ignore)]
3603    fn distinct() {
3604        let df = df! {
3605            "flt" => [1., 1., 2., 2., 3., 3.],
3606            "int" => [1, 1, 2, 2, 3, 3, ],
3607            "str" => ["a", "a", "b", "b", "c", "c"]
3608        }
3609        .unwrap();
3610        let df = df
3611            .unique_stable(None, UniqueKeepStrategy::First, None)
3612            .unwrap()
3613            .sort(["flt"], SortMultipleOptions::default())
3614            .unwrap();
3615        let valid = df! {
3616            "flt" => [1., 2., 3.],
3617            "int" => [1, 2, 3],
3618            "str" => ["a", "b", "c"]
3619        }
3620        .unwrap();
3621        assert!(df.equals(&valid));
3622    }
3623
3624    #[test]
3625    fn test_vstack() {
3626        // check that it does not accidentally rechunk
3627        let mut df = df! {
3628            "flt" => [1., 1., 2., 2., 3., 3.],
3629            "int" => [1, 1, 2, 2, 3, 3, ],
3630            "str" => ["a", "a", "b", "b", "c", "c"]
3631        }
3632        .unwrap();
3633
3634        df.vstack_mut(&df.slice(0, 3)).unwrap();
3635        assert_eq!(df.first_col_n_chunks(), 2)
3636    }
3637
3638    #[test]
3639    fn test_vstack_on_empty_dataframe() {
3640        let mut df = DataFrame::empty();
3641
3642        let df_data = df! {
3643            "flt" => [1., 1., 2., 2., 3., 3.],
3644            "int" => [1, 1, 2, 2, 3, 3, ],
3645            "str" => ["a", "a", "b", "b", "c", "c"]
3646        }
3647        .unwrap();
3648
3649        df.vstack_mut(&df_data).unwrap();
3650        assert_eq!(df.height, 6)
3651    }
3652
3653    #[test]
3654    fn test_replace_or_add() -> PolarsResult<()> {
3655        let mut df = df!(
3656            "a" => [1, 2, 3],
3657            "b" => [1, 2, 3]
3658        )?;
3659
3660        // check that the new column is "c" and not "bar".
3661        df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3662
3663        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3664        Ok(())
3665    }
3666
3667    #[test]
3668    fn test_unique_keep_none_with_slice() {
3669        let df = df! {
3670            "x" => [1, 2, 3, 2, 1]
3671        }
3672        .unwrap();
3673        let out = df
3674            .unique_stable(
3675                Some(&["x".to_string()][..]),
3676                UniqueKeepStrategy::None,
3677                Some((0, 2)),
3678            )
3679            .unwrap();
3680        let expected = df! {
3681            "x" => [3]
3682        }
3683        .unwrap();
3684        assert!(out.equals(&expected));
3685    }
3686
3687    #[test]
3688    #[cfg(feature = "dtype-i8")]
3689    fn test_apply_result_schema() {
3690        let mut df = df! {
3691            "x" => [1, 2, 3, 2, 1]
3692        }
3693        .unwrap();
3694
3695        let schema_before = df.schema().clone();
3696        df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
3697        assert_ne!(&schema_before, df.schema());
3698    }
3699}