o
    ?Di*                     @   s8  d dl mZ d dlmZmZmZmZmZmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlZd dlmZmZmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ erd dl%Z&d dl'm(Z) ddl*m+Z+m,Z, ddl-m.Z. ddlmZ ddl/m0Z0m1Z1m2Z2 G dd dZ(dgZ3dS )    )reduce)
TYPE_CHECKINGAnyCallableListDictOptionalTupleUnioncastoverloadN)	iskeyword)ColumnExpression
ExpressionStarExpression   )ColumnOrName   )PySparkTypeErrorPySparkValueErrorPySparkIndexErrorContributionsAcceptedError)ColumnDataFrameWriter)duckdb_to_spark_schema)Row
StructType)	DataFrameGroupedDataGrouping)SparkSession)r   )_to_column_exprcollitc                       s  e Zd ZdejddfddZdzdd	Zd{ddZd|ddZde	ddfddZ
de	ddfddZde	de	dd fddZde	dedd fddZdee	ef dd fddZdee	e	f dd fdd Zd!ed" d#ed$edd fd%d&Zd'ee	eeee	ef  f d$edd fd(d)ZeZd}d*ee deee ee f fd+d,ZeZd-edee fd.d/Zd~d2d3ZeZ dd4d5Z!e"dee	 fd6d7Z#dee	 fd8d9Z$dee	 f fd:d;Z%		dd<d d=eee	ee	 eee f  d>ee	 dd fd?d@Z&ddAdBZ'dCe	dd fdDdEZ(ddFdGZ)de	fdHdIZ*d-edd fdJdKZ+dLe	fdMdNZ,e"de-fdOdPZ.e/dLeee	f defdQdRZ0e/dLeeee1f dd fdSdRZ0dLeee	eee1f deed f fdTdRZ0de	defdUdVZ2e/ddXdYZ3e/dZeee ee	 f ddWfd[dYZ3dd\dYZ3e3Z4e"de5fd]d^Z6d_d` Z7ddadbZ8e8Z9	cdd<d dde:dd fdedfZ;ddgdhZ<ddidjZ=ddkdlZ>d}dmeee	  dd fdndoZ?e?Z@ddpdqZAdefdrdsZBddtduZCddvdwZDdee fdxdyZE  ZFS )r   relationsessionr#   c                 C   s8   || _ || _d | _| j d urt| j j| j j| _d S d S N)r'   r(   _schemar   columnstypes)selfr'   r(    r.   `/var/www/Datamplify/venv/lib/python3.10/site-packages/duckdb/experimental/spark/sql/dataframe.py__init__(   s   
zDataFrame.__init__returnNc                 K   s   | j   d S r)   )r'   show)r-   kwargsr.   r.   r/   r2   /      zDataFrame.showPandasDataFramec                 C   
   | j  S r)   )r'   dfr-   r.   r.   r/   toPandas2      
zDataFrame.toPandaspa.Tablec                 C   r6   )a  
        Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.

        This is only available if PyArrow is installed and available.

        .. versionadded:: 4.0.0

        Notes
        -----
        This method should only be used if the resulting PyArrow ``pyarrow.Table`` is
        expected to be small, as all the data is loaded into the driver's memory.

        This API is a developer API.

        Examples
        --------
        >>> df.toArrow()  # doctest: +SKIP
        pyarrow.Table
        age: int64
        name: string
        ----
        age: [[2,5]]
        name: [["Alice","Bob"]]
        )r'   arrowr8   r.   r.   r/   toArrow5   s   
zDataFrame.toArrownamec                 C   s   | j |d dS )a\  Creates or replaces a local temporary view with this :class:`DataFrame`.

        The lifetime of this temporary table is tied to the :class:`SparkSession`
        that was used to create this :class:`DataFrame`.

        Parameters
        ----------
        name : str
            Name of the view.

        Examples
        --------
        Create a local temporary view named 'people'.

        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
        >>> df.createOrReplaceTempView("people")

        Replace the local temporary view.

        >>> df2 = df.filter(df.age > 3)
        >>> df2.createOrReplaceTempView("people")
        >>> df3 = spark.sql("SELECT * FROM people")
        >>> sorted(df3.collect()) == sorted(df2.collect())
        True
        >>> spark.catalog.dropTempView("people")
        True

        TN)r'   create_viewr-   r>   r.   r.   r/   createOrReplaceTempViewP   s   z!DataFrame.createOrReplaceTempViewc                 C      t r)   )NotImplementedErrorr@   r.   r.   r/   createGlobalTempViewo      zDataFrame.createGlobalTempView
columnNamenewNamec                 C   sn   || j vrtd| g }| j jD ]}t|}| | kr%||}|| q| j j| }t|| j	S )Nz*DataFrame does not contain a column named )
r'   
ValueErrorr+   r   casefoldaliasappendselectr   r(   )r-   rF   rG   colsxr%   relr.   r.   r/   withColumnRenamedr   s   

zDataFrame.withColumnRenamedr%   c                 C   s   t |tstddt|jdd|| jv r9g }| jjD ]}| | kr0||j	
| q|t| qndd | jjD }||j	
| | jj| }t|| jS )N
NOT_COLUMNr%   arg_namearg_typeerror_classmessage_parametersc                 S      g | ]}t |qS r.   r   .0rN   r.   r.   r/   
<listcomp>       z(DataFrame.withColumn.<locals>.<listcomp>)
isinstancer   r   type__name__r'   r+   rI   rK   exprrJ   r   rL   r   r(   )r-   rF   r%   rM   rN   rO   r.   r.   r/   
withColumn~   s    

zDataFrame.withColumncolsMapc                 G   s   t |dksJ |d }t|tstddt|jddt| }t| }dd |D }g }| j	j
D ](}| |v rW||}||}||}	||	j| q6|t| q6t||D ]\}}	||	j| qd| j	j| }
t|
| jS )	a  
        Returns a new :class:`DataFrame` by adding multiple columns or replacing the
        existing columns that have the same names.

        The colsMap is a map of column name and column, the column must only refer to attributes
        supplied by this Dataset. It is an error to add columns that refer to some other Dataset.

        .. versionadded:: 3.3.0
           Added support for multiple columns adding

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        colsMap : dict
            a dict of column name and :class:`Column`. Currently, only a single map is supported.

        Returns
        -------
        :class:`DataFrame`
            DataFrame with new or replaced columns.

        Examples
        --------
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
        >>> df.withColumns({'age2': df.age + 2, 'age3': df.age + 3}).show()
        +---+-----+----+----+
        |age| name|age2|age3|
        +---+-----+----+----+
        |  2|Alice|   4|   5|
        |  5|  Bob|   7|   8|
        +---+-----+----+----+
        r   r   NOT_DICTrc   rR   rU   c                 S      g | ]}|  qS r.   rI   rZ   r.   r.   r/   r\      r]   z)DataFrame.withColumns.<locals>.<listcomp>)lenr^   dictr   r_   r`   listkeysvaluesr'   r+   rI   indexpoprK   ra   rJ   r   ziprL   r   r(   )r-   rc   column_namesr+   column_names_for_comparisonrM   rN   idxcol_namer%   rO   r.   r.   r/   withColumns   s0   $



zDataFrame.withColumnsc                 C   s   t |tstddt|jddt| t| jj }|r)t	dd
| t| }dd |D }g }| jjD ]$}t|}| |v r[||}||}	||	 }
||
}|| q<| jj| }t|| jS )	an  
        Returns a new :class:`DataFrame` by renaming multiple columns.
        This is a no-op if the schema doesn't contain the given column names.

        .. versionadded:: 3.4.0
           Added support for multiple columns renaming

        Parameters
        ----------
        colsMap : dict
            a dict of existing column names and corresponding desired column names.
            Currently, only a single map is supported.

        Returns
        -------
        :class:`DataFrame`
            DataFrame with renamed columns.

        See Also
        --------
        :meth:`withColumnRenamed`

        Notes
        -----
        Support Spark Connect

        Examples
        --------
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
        >>> df = df.withColumns({'age2': df.age + 2, 'age3': df.age + 3})
        >>> df.withColumnsRenamed({'age2': 'age4', 'age3': 'age5'}).show()
        +---+-----+----+----+
        |age| name|age4|age5|
        +---+-----+----+----+
        |  2|Alice|   4|   5|
        |  5|  Bob|   7|   8|
        +---+-----+----+----+
        rd   rc   rR   rU   z&DataFrame does not contain column(s): , c                 S   re   r.   rf   rZ   r.   r.   r/   r\     r]   z0DataFrame.withColumnsRenamed.<locals>.<listcomp>)r^   rh   r   r_   r`   setrj   r'   r+   rH   joinri   r   rI   rl   rm   rJ   rK   rL   r   r(   )r-   rc   unknown_columnsold_column_namesold_column_names_for_comparisonrM   rN   r%   rq   rr   new_col_namerO   r.   r.   r/   withColumnsRenamed   s.   
'


zDataFrame.withColumnsRenamedfunc).r   argsr3   c                 O   s4   || g|R i |}t |tsJ dt| |S )af  Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.

        .. versionadded:: 3.0.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        func : function
            a function that takes and returns a :class:`DataFrame`.
        *args
            Positional arguments to pass to func.

            .. versionadded:: 3.3.0
        **kwargs
            Keyword arguments to pass to func.

            .. versionadded:: 3.3.0

        Returns
        -------
        :class:`DataFrame`
            Transformed DataFrame.

        Examples
        --------
        >>> from pyspark.sql.functions import col
        >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
        >>> def cast_all_to_int(input_df):
        ...     return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
        ...
        >>> def sort_columns_asc(input_df):
        ...     return input_df.select(*sorted(input_df.columns))
        ...
        >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()
        +-----+---+
        |float|int|
        +-----+---+
        |    1|  1|
        |    2|  2|
        +-----+---+

        >>> def add_n(input_df, n):
        ...     return input_df.select([(col(col_name) + n).alias(col_name)
        ...                             for col_name in input_df.columns])
        >>> df.transform(add_n, 1).transform(add_n, n=10).show()
        +---+-----+
        |int|float|
        +---+-----+
        | 12| 12.0|
        | 13| 13.0|
        +---+-----+
        zCFunc returned an instance of type [%s], should have been DataFrame.)r^   r   r_   )r-   r|   r}   r3   resultr.   r.   r/   	transform&  s   9zDataFrame.transformrM   c                 O   sD  |s
t dddidt|dkrt|d tr|d }g }|D ]<}|}t|tr-t|}n)t|trVt|tsV|dkrB| |d  }n|dk rP| | d   }nt	di d|
| q|dd	}t|ttfrs|srd
d |D }nt|trdd t||D }ntddt|jdddd |D }| jj| }t|| jS )a  Returns a new :class:`DataFrame` sorted by the specified column(s).

        Parameters
        ----------
        cols : str, list, or :class:`Column`, optional
             list of :class:`Column` or column names to sort by.

        Other Parameters
        ----------------
        ascending : bool or list, optional, default True
            boolean or list of boolean.
            Sort ascending vs. descending. Specify list for multiple sort orders.
            If a list is specified, the length of the list must equal the length of the `cols`.

        Returns
        -------
        :class:`DataFrame`
            Sorted DataFrame.

        Examples
        --------
        >>> from pyspark.sql.functions import desc, asc
        >>> df = spark.createDataFrame([
        ...     (2, "Alice"), (5, "Bob")], schema=["age", "name"])

        Sort the DataFrame in ascending order.

        >>> df.sort(asc("age")).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        |  5|  Bob|
        +---+-----+

        Sort the DataFrame in descending order.

        >>> df.sort(df.age.desc()).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|Alice|
        +---+-----+
        >>> df.orderBy(df.age.desc()).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|Alice|
        +---+-----+
        >>> df.sort("age", ascending=False).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|Alice|
        +---+-----+

        Specify multiple columns

        >>> df = spark.createDataFrame([
        ...     (2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])
        >>> df.orderBy(desc("age"), "name").show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|Alice|
        |  2|  Bob|
        +---+-----+

        Specify multiple columns for sorting order at `ascending`.

        >>> df.orderBy(["age", "name"], ascending=[False, False]).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|  Bob|
        |  2|Alice|
        +---+-----+
        CANNOT_BE_EMPTYitemcolumnrU   r   r   
ZERO_INDEX	ascendingTc                 S   re   r.   descr[   cr.   r.   r/   r\     r]   z"DataFrame.sort.<locals>.<listcomp>c                 S   s    g | ]\}}|r
|n|  qS r.   r   )r[   ascr   r.   r.   r/   r\     s     NOT_BOOL_OR_LISTrR   c                 S   rX   r.   r$   r   r.   r.   r/   r\     r]   )r   rg   r^   ri   strr%   intboolr   r   rK   getrn   r   r_   r`   r'   sortr   r(   )r-   rM   r3   r+   r   _cr   rO   r.   r.   r/   r   f  sH   V


zDataFrame.sortnc                 C   s,   |d u r|  d}|r|d S d S | |S )Nr   r   )headtake)r-   r   rsr.   r.   r/   r     s   

zDataFrame.headnumc                 C      |  | S r)   )limitcollect)r-   r   r.   r.   r/   r     r4   zDataFrame.take	conditionr   c                 C   sR   t |tr	|j}nt |tr|}ntddt|jdd| j|}t	|| j
S )a  Filters rows using the given condition.

        :func:`where` is an alias for :func:`filter`.

        Parameters
        ----------
        condition : :class:`Column` or str
            a :class:`Column` of :class:`types.BooleanType`
            or a string of SQL expressions.

        Returns
        -------
        :class:`DataFrame`
            Filtered DataFrame.

        Examples
        --------
        >>> df = spark.createDataFrame([
        ...     (2, "Alice"), (5, "Bob")], schema=["age", "name"])

        Filter by :class:`Column` instances.

        >>> df.filter(df.age > 3).show()
        +---+----+
        |age|name|
        +---+----+
        |  5| Bob|
        +---+----+
        >>> df.where(df.age == 2).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        +---+-----+

        Filter by SQL expression in a string.

        >>> df.filter("age > 3").show()
        +---+----+
        |age|name|
        +---+----+
        |  5| Bob|
        +---+----+
        >>> df.where("age = 2").show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        +---+-----+
        NOT_COLUMN_OR_STRr   rR   rU   )r^   r   ra   r   r   r_   r`   r'   filterr   r(   )r-   r   condrO   r.   r.   r/   r     s   
3
zDataFrame.filterc                 G   sh   t |}t|dkr|d }t|t rdd |D }nt|tr#|jnt|g}| jj| }t|| j	S )Nr   r   c                 S   s$   g | ]}t |tr|jnt|qS r.   )r^   r   ra   r   rZ   r.   r.   r/   r\   ;  s    z$DataFrame.select.<locals>.<listcomp>)
ri   rg   r^   r   ra   r   r'   rL   r   r(   )r-   rM   projectionsrO   r.   r.   r/   rL   6  s   
zDataFrame.selectc                 C   s   dd | j jD S )z~Returns all column names as a list.

        Examples
        --------
        >>> df.columns
        ['age', 'name']
        c                 S   s   g | ]}|j qS r.   )r>   )r[   fr.   r.   r/   r\   N  s    z%DataFrame.columns.<locals>.<listcomp>)schemafieldsr8   r.   r.   r/   r+   E  s   	zDataFrame.columnsc                 C      | j S r)   )r+   r8   r.   r.   r/   _ipython_key_completions_P  s   z#DataFrame._ipython_key_completions_c                    s,   t t  }|dd | jD  t|S )Nc                 s   s$    | ]}|  rt|s|V  qd S r)   )isidentifierr   r   r.   r.   r/   	<genexpr>W  s   " z$DataFrame.__dir__.<locals>.<genexpr>)ru   super__dir__updater+   sorted)r-   out	__class__r.   r/   r   U  s   zDataFrame.__dir__otheronhowc                 C   s   |durt |ts|g}|dur>tdd |D s>t |ts J dd |D }t |d ts2J dtdd ttt |}|du rN|du rN| j|j}n<|du rTd	}|du r[d
}nt |trltdd |D rl|}nt	|}t |t	syJ ddd }||}| j|j||}t
|| jS )a  Joins with another :class:`DataFrame`, using the given join expression.

        Parameters
        ----------
        other : :class:`DataFrame`
            Right side of the join
        on : str, list or :class:`Column`, optional
            a string for the join column name, a list of column names,
            a join expression (Column), or a list of Columns.
            If `on` is a string or a list of strings indicating the name of the join column(s),
            the column(s) must exist on both sides, and this performs an equi-join.
        how : str, optional
            default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
            ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``,
            ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,
            ``anti``, ``leftanti`` and ``left_anti``.

        Returns
        -------
        :class:`DataFrame`
            Joined DataFrame.

        Examples
        --------
        The following performs a full outer join between ``df1`` and ``df2``.

        >>> from pyspark.sql import Row
        >>> from pyspark.sql.functions import desc
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")]).toDF("age", "name")
        >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
        >>> df3 = spark.createDataFrame([Row(age=2, name="Alice"), Row(age=5, name="Bob")])
        >>> df4 = spark.createDataFrame([
        ...     Row(age=10, height=80, name="Alice"),
        ...     Row(age=5, height=None, name="Bob"),
        ...     Row(age=None, height=None, name="Tom"),
        ...     Row(age=None, height=None, name=None),
        ... ])

        Inner join on columns (default)

        >>> df.join(df2, 'name').select(df.name, df2.height).show()
        +----+------+
        |name|height|
        +----+------+
        | Bob|    85|
        +----+------+
        >>> df.join(df4, ['name', 'age']).select(df.name, df.age).show()
        +----+---+
        |name|age|
        +----+---+
        | Bob|  5|
        +----+---+

        Outer join for both DataFrames on the 'name' column.

        >>> df.join(df2, df.name == df2.name, 'outer').select(
        ...     df.name, df2.height).sort(desc("name")).show()
        +-----+------+
        | name|height|
        +-----+------+
        |  Bob|    85|
        |Alice|  NULL|
        | NULL|    80|
        +-----+------+
        >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show()
        +-----+------+
        | name|height|
        +-----+------+
        |  Tom|    80|
        |  Bob|    85|
        |Alice|  NULL|
        +-----+------+

        Outer join for both DataFrams with multiple columns.

        >>> df.join(
        ...     df3,
        ...     [df.name == df3.name, df.age == df3.age],
        ...     'outer'
        ... ).select(df.name, df3.age).show()
        +-----+---+
        | name|age|
        +-----+---+
        |Alice|  2|
        |  Bob|  5|
        +-----+---+
        Nc                 S      g | ]}t |tqS r.   r^   r   rZ   r.   r.   r/   r\         z"DataFrame.join.<locals>.<listcomp>c                 S   rX   r.   r   rZ   r.   r.   r/   r\     r]   r   z%on should be Column or list of Columnc                 S   s
   |  |S r)   )__and__)rN   yr.   r.   r/   <lambda>  s   
 z DataFrame.join.<locals>.<lambda>innertruec                 S   r   r.   r   rZ   r.   r.   r/   r\     r   zhow should be a stringc                 S   s`   g g dddgddgddgdd	gd
}d }|  D ]\}}| |ks%| |v r)|} nq|s.| }|S )N)full	fullouter
full_outer	leftouter
left_outer
rightouterright_outerleftanti	left_antileftsemi	left_semi)r   outerleftrightantisemi)items)r   known_aliasesmapped_typer_   aliasesr.   r.   r/   map_to_recognized_jointype  s    z2DataFrame.join.<locals>.map_to_recognized_jointype)r^   ri   allr   r   r   r   r'   rv   r   r   r(   )r-   r   r   r   r~   r   r.   r.   r/   rv   Z  s2   ^zDataFrame.joinc                 C      t | j|j| jS )a  Returns the cartesian product with another :class:`DataFrame`.

        .. versionadded:: 2.1.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            Right side of the cartesian product.

        Returns
        -------
        :class:`DataFrame`
            Joined DataFrame.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
        >>> df2 = spark.createDataFrame(
        ...     [Row(height=80, name="Tom"), Row(height=85, name="Bob")])
        >>> df.crossJoin(df2.select("height")).select("age", "name", "height").show()
        +---+-----+------+
        |age| name|height|
        +---+-----+------+
        | 14|  Tom|    80|
        | 14|  Tom|    85|
        | 23|Alice|    80|
        | 23|Alice|    85|
        | 16|  Bob|    80|
        | 16|  Bob|    85|
        +---+-----+------+
        )r   r'   crossr(   r-   r   r.   r.   r/   	crossJoin     %zDataFrame.crossJoinrJ   c                 C   s&   t |ts	J dt| j|| jS )a  Returns a new :class:`DataFrame` with an alias set.

        Parameters
        ----------
        alias : str
            an alias name to be set for the :class:`DataFrame`.

        Returns
        -------
        :class:`DataFrame`
            Aliased DataFrame.

        Examples
        --------
        >>> from pyspark.sql.functions import col, desc
        >>> df = spark.createDataFrame(
        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
        >>> df_as1 = df.alias("df_as1")
        >>> df_as2 = df.alias("df_as2")
        >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
        >>> joined_df.select(
        ...     "df_as1.name", "df_as2.name", "df_as2.age").sort(desc("df_as1.name")).show()
        +-----+-----+---+
        | name| name|age|
        +-----+-----+---+
        |  Tom|  Tom| 14|
        |  Bob|  Bob| 16|
        |Alice|Alice| 23|
        +-----+-----+---+
        zalias should be a string)r^   r   r   r'   	set_aliasr(   )r-   rJ   r.   r.   r/   rJ     s   zDataFrame.aliasc                    s   g }|D ]&}t |tr|| qt |tr||j  qtddt|jdd fdd|D }t	|d}t
 j| jS )Nr   r%   rR   rU   c                    s   g | ]
}| j jv r|qS r.   )r'   r+   rZ   r8   r.   r/   r\   A  s    z"DataFrame.drop.<locals>.<listcomp>)exclude)r^   r   rK   r   ra   get_namer   r_   r`   r   r   r'   rL   r(   )r-   rM   r   r%   ra   r.   r8   r/   drop4  s   


zDataFrame.dropc                 C   s
   t | jS r)   )r   r'   r8   r.   r.   r/   __repr__E  r:   zDataFrame.__repr__c                 C   s   | j |}t|| jS )a  Limits the result count to the number specified.

        Parameters
        ----------
        num : int
            Number of records to return. Will return this number of records
            or all records if the DataFrame contains less than this number of records.

        Returns
        -------
        :class:`DataFrame`
            Subset of the records

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
        >>> df.limit(1).show()
        +---+----+
        |age|name|
        +---+----+
        | 14| Tom|
        +---+----+
        >>> df.limit(0).show()
        +---+----+
        |age|name|
        +---+----+
        +---+----+
        )r'   r   r   r(   )r-   r   rO   r.   r.   r/   r   H  s   zDataFrame.limitr   c                 C   s
   || j v S )zY
        Check if the :class:`DataFrame` contains a column by the name of `item`
        )r'   r-   r   r.   r.   r/   __contains__i  s   
zDataFrame.__contains__c                 C   r   )a(  Returns the schema of this :class:`DataFrame` as a :class:`duckdb.experimental.spark.sql.types.StructType`.

        Examples
        --------
        >>> df.schema
        StructType([StructField('age', IntegerType(), True),
                    StructField('name', StringType(), True)])
        )r*   r8   r.   r.   r/   r   o  s   
zDataFrame.schemac                 C      d S r)   r.   r   r.   r.   r/   __getitem__{     zDataFrame.__getitem__c                 C   r   r)   r.   r   r.   r.   r/   r     r   c                 C   sv   t |trtt| jj|S t |tr| |S t |tt	fr%| j
| S t |tr2t| j| jS tdt| )a  Returns the column as a :class:`Column`.

        Examples
        --------
        >>> df.select(df['age']).collect()
        [Row(age=2), Row(age=5)]
        >>> df[ ["name", "age"]].collect()
        [Row(name='Alice', age=2), Row(name='Bob', age=5)]
        >>> df[ df.age > 3 ].collect()
        [Row(age=5, name='Bob')]
        >>> df[df[0] > 3].collect()
        [Row(age=5, name='Bob')]
        zUnexpected item type: )r^   r   r   duckdbr   r'   rJ   r   ri   tuplerL   r   r%   r*   r>   	TypeErrorr_   r   r.   r.   r/   r     s   




c                 C   s4   || j jvrtd| jj|f tt| j j|S )zReturns the :class:`Column` denoted by ``name``.

        Examples
        --------
        >>> df.select(df.age).collect()
        [Row(age=2), Row(age=5)]
        z!'%s' object has no attribute '%s')	r'   r+   AttributeErrorr   r`   r   r   r   rJ   r@   r.   r.   r/   __getattr__  s
   zDataFrame.__getattr__r!   c                 G   r   r)   r.   )r-   rM   r.   r.   r/   groupBy  r   zDataFrame.groupBy_DataFrame__colsc                 C   r   r)   r.   )r-   r   r.   r.   r/   r     r   c                 G   sF   ddl m}m} t|dkrt|d tr|d }n|}||| | S )aM  Groups the :class:`DataFrame` using the specified columns,
        so we can run aggregation on them. See :class:`GroupedData`
        for all the available aggregate functions.

        :func:`groupby` is an alias for :func:`groupBy`.

        Parameters
        ----------
        cols : list, str or :class:`Column`
            columns to group by.
            Each element should be a column name (string) or an expression (:class:`Column`)
            or list of them.

        Returns
        -------
        :class:`GroupedData`
            Grouped data by given columns.

        Examples
        --------
        >>> df = spark.createDataFrame([
        ...     (2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])

        Empty grouping columns triggers a global aggregation.

        >>> df.groupBy().avg().show()
        +--------+
        |avg(age)|
        +--------+
        |    2.75|
        +--------+

        Group-by 'name', and specify a dictionary to calculate the summation of 'age'.

        >>> df.groupBy("name").agg({"age": "sum"}).sort("name").show()
        +-----+--------+
        | name|sum(age)|
        +-----+--------+
        |Alice|       2|
        |  Bob|       9|
        +-----+--------+

        Group-by 'name', and calculate maximum values.

        >>> df.groupBy(df.name).max().sort("name").show()
        +-----+--------+
        | name|max(age)|
        +-----+--------+
        |Alice|       2|
        |  Bob|       5|
        +-----+--------+

        Group-by 'name' and 'age', and calculate the number of rows in each group.

        >>> df.groupBy(["name", df.age]).count().sort("name", "age").show()
        +-----+---+-----+
        | name|age|count|
        +-----+---+-----+
        |Alice|  2|    1|
        |  Bob|  2|    2|
        |  Bob|  5|    1|
        +-----+---+-----+
        r   r    r   )groupr!   r"   rg   r^   ri   )r-   rM   r!   r"   r+   r.   r.   r/   r     s
   @
c                 C   s   t | S r)   r   r8   r.   r.   r/   write  s   zDataFrame.writec                 C   rB   r)   r   r8   r.   r.   r/   printSchema  rE   zDataFrame.printSchemac                 C   r   )a  Return a new :class:`DataFrame` containing union of rows in this and another
        :class:`DataFrame`.

        Parameters
        ----------
        other : :class:`DataFrame`
            Another :class:`DataFrame` that needs to be unioned

        Returns
        -------
        :class:`DataFrame`

        See Also
        --------
        DataFrame.unionAll

        Notes
        -----
        This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
        (that does deduplication of elements), use this function followed by :func:`distinct`.

        Also as standard in SQL, this function resolves columns by position (not by name).

        Examples
        --------
        >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
        >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
        >>> df1.union(df2).show()
        +----+----+----+
        |col0|col1|col2|
        +----+----+----+
        |   1|   2|   3|
        |   4|   5|   6|
        +----+----+----+
        >>> df1.union(df1).show()
        +----+----+----+
        |col0|col1|col2|
        +----+----+----+
        |   1|   2|   3|
        |   1|   2|   3|
        +----+----+----+
        )r   r'   unionr(   r   r.   r.   r/   r     s   +zDataFrame.unionFallowMissingColumnsc                 C   sl   |r$g }| j jD ]}||j jv r|| q|td q|j| }n|j| j j }t| j |j | jS )a  Returns a new :class:`DataFrame` containing union of rows in this and another
        :class:`DataFrame`.

        This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
        union (that does deduplication of elements), use this function followed by :func:`distinct`.

        .. versionadded:: 2.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            Another :class:`DataFrame` that needs to be combined.
        allowMissingColumns : bool, optional, default False
           Specify whether to allow missing columns.

           .. versionadded:: 3.1.0

        Returns
        -------
        :class:`DataFrame`
            Combined DataFrame.

        Examples
        --------
        The difference between this function and :func:`union` is that this function
        resolves columns by name (not by position):

        >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
        >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
        >>> df1.unionByName(df2).show()
        +----+----+----+
        |col0|col1|col2|
        +----+----+----+
        |   1|   2|   3|
        |   6|   4|   5|
        +----+----+----+

        When the parameter `allowMissingColumns` is ``True``, the set of column names
        in this and other :class:`DataFrame` can differ; missing columns will be filled with null.
        Further, the missing columns of this :class:`DataFrame` will be added at the end
        in the schema of the union result:

        >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
        >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col3"])
        >>> df1.unionByName(df2, allowMissingColumns=True).show()
        +----+----+----+----+
        |col0|col1|col2|col3|
        +----+----+----+----+
        |   1|   2|   3|NULL|
        |NULL|   4|   5|   6|
        +----+----+----+----+
        N)r'   r+   rK   r&   rL   r   r   r(   )r-   r   r   rM   r%   r.   r.   r/   unionByName4  s   :zDataFrame.unionByNamec                 C   r   )a
  Return a new :class:`DataFrame` containing rows only in
        both this :class:`DataFrame` and another :class:`DataFrame`.
        Note that any duplicates are removed. To preserve duplicates
        use :func:`intersectAll`.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            Another :class:`DataFrame` that needs to be combined.

        Returns
        -------
        :class:`DataFrame`
            Combined DataFrame.

        Notes
        -----
        This is equivalent to `INTERSECT` in SQL.

        Examples
        --------
        >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
        >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
        >>> df1.intersect(df2).sort(df1.C1.desc()).show()
        +---+---+
        | C1| C2|
        +---+---+
        |  b|  3|
        |  a|  1|
        +---+---+
        )intersectAlldrop_duplicatesr   r.   r.   r/   	intersect{  s   %zDataFrame.intersectc                 C   r   )a
  Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`
        and another :class:`DataFrame` while preserving duplicates.

        This is equivalent to `INTERSECT ALL` in SQL. As standard in SQL, this function
        resolves columns by position (not by name).

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            Another :class:`DataFrame` that needs to be combined.

        Returns
        -------
        :class:`DataFrame`
            Combined DataFrame.

        Examples
        --------
        >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
        >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
        >>> df1.intersectAll(df2).sort("C1", "C2").show()
        +---+---+
        | C1| C2|
        +---+---+
        |  a|  1|
        |  a|  1|
        |  b|  3|
        +---+---+
        )r   r'   r   r(   r   r.   r.   r/   r     s   #zDataFrame.intersectAllc                 C   r   )a  Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but
        not in another :class:`DataFrame` while preserving duplicates.

        This is equivalent to `EXCEPT ALL` in SQL.
        As standard in SQL, this function resolves columns by position (not by name).

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            The other :class:`DataFrame` to compare to.

        Returns
        -------
        :class:`DataFrame`

        Examples
        --------
        >>> df1 = spark.createDataFrame(
        ...         [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b",  3), ("c", 4)], ["C1", "C2"])
        >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
        >>> df1.exceptAll(df2).show()
        +---+---+
        | C1| C2|
        +---+---+
        |  a|  1|
        |  a|  1|
        |  a|  2|
        |  c|  4|
        +---+---+

        )r   r'   except_r(   r   r.   r.   r/   	exceptAll  r   zDataFrame.exceptAllsubsetc                 C   sl   |r2dt  j }ddd |D }d| d| }t| j|d| j}|| d	|S | 
 S )	a  Return a new :class:`DataFrame` with duplicate rows removed,
        optionally only considering certain columns.

        For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming
        :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop
        duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can
        be and the system will accordingly limit the state. In addition, data older than
        watermark will be dropped to avoid any possibility of duplicates.

        :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.

        Parameters
        ----------
        subset : List of column names, optional
            List of columns to use for duplicate comparison (default All columns).

        Returns
        -------
        :class:`DataFrame`
            DataFrame without duplicates.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([
        ...     Row(name='Alice', age=5, height=80),
        ...     Row(name='Alice', age=5, height=80),
        ...     Row(name='Alice', age=10, height=80)
        ... ])

        Deduplicate the same rows.

        >>> df.dropDuplicates().show()
        +-----+---+------+
        | name|age|height|
        +-----+---+------+
        |Alice|  5|    80|
        |Alice| 10|    80|
        +-----+---+------+

        Deduplicate values on 'name' and 'height' columns.

        >>> df.dropDuplicates(['name', 'height']).show()
        +-----+---+------+
        | name|age|height|
        +-----+---+------+
        |Alice|  5|    80|
        +-----+---+------+
        tmp_col_rt   c                 S   s   g | ]}d | d qS )"r.   r   r.   r.   r/   r\   "  s    z,DataFrame.dropDuplicates.<locals>.<listcomp>zOVER(PARTITION BY z) AS *z = 1)uuiduuid1hexrv   r   r'   
row_numberr(   r   r   distinct)r-   r   rn_col
subset_strwindow_specr7   r.   r.   r/   dropDuplicates  s   2zDataFrame.dropDuplicatesc                 C   s   | j  }t|| jS )a  Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.

        Returns
        -------
        :class:`DataFrame`
            DataFrame with distinct records.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])

        Return the number of distinct rows in the :class:`DataFrame`

        >>> df.distinct().count()
        2
        )r'   r   r   r(   )r-   distinct_relr.   r.   r/   r   ,  s   
zDataFrame.distinctc                 C   s   | j d}t| d S )a  Returns the number of rows in this :class:`DataFrame`.

        Returns
        -------
        int
            Number of rows.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])

        Return the number of rows in the :class:`DataFrame`.

        >>> df.count()
        3
        r   r   )r'   countr   fetchone)r-   	count_relr.   r.   r/   r   A  s   zDataFrame.countc                 G   sV   | j j}t|}|t|ksJ dd t||D }d|}| j |}t|| jS )Nc                 S   s$   g | ]\}}| d | d| qS )z::z as r.   )r[   existingtarget_typer.   r.   r/   r\   [  s    z)DataFrame._cast_types.<locals>.<listcomp>rt   )r'   r+   rg   rn   rv   projectr   r(   )r-   r,   existing_columnstypes_countcast_expressionsnew_relr.   r.   r/   _cast_typesV  s   
zDataFrame._cast_typesc                 G   s`   | j j}t|}|t|krtdddd |D }dd t||D }| j j| }t|| jS )NzHProvided column names and number of columns in the DataFrame don't match)messagec                 S   rX   r.   rY   rZ   r.   r.   r/   r\   k  r]   z"DataFrame.toDF.<locals>.<listcomp>c                 S   s   g | ]	\}}| |qS r.   )rJ   )r[   r   newr.   r.   r/   r\   l  s    )r'   r+   rg   r   rn   r  r   r(   )r-   rM   r  column_countr   r  r.   r.   r/   toDFc  s   zDataFrame.toDFc                    s8   | j j | j  }dtfdd fdd|D }|S )Nr1   c                 S   s   t tt| }t||_|S r)   )r   __new__r   ri   
__fields__)rk   namesrowr.   r.   r/   construct_rowv  s   
z(DataFrame.collect.<locals>.construct_rowc                    s   g | ]}| qS r.   r.   rZ   r+   r  r.   r/   r\   {  r   z%DataFrame.collect.<locals>.<listcomp>)r'   r+   fetchallr   )r-   r~   rowsr.   r  r/   r   r  s
   
zDataFrame.collect)r1   N)r1   r5   )r1   r;   r)   )r   r   r1   r   )r1   r   )NN)r   r   r1   r   )rM   r   r1   r   )rM   r   r1   r!   )F)Gr`   
__module____qualname__r   DuckDBPyRelationr0   r2   r9   r=   r   rA   rD   rP   r   rb   r   rs   r{   r   r   r   r
   r   r   orderByr   r   r   r   firstr   r   whererL   propertyr+   r   r   rv   r   rJ   r   r   r   r   r   r   r   r   r	   r   r   groupbyr   r   r   r   unionAllr   r   r   r   r   r   r   r   r   r  r  r   __classcell__r.   r.   r   r/   r   '   s    


KI
@
 (
?


 
'
"!

$
H
-

G
'
%';


r   )4	functoolsr   typingr   r   r   r   r   r   r	   r
   r   r   r   keywordr   r   r   r   r   _typingr   errorsr   r   r   	exceptionr   r   r   
readwriterr   
type_utilsr   r,   r   r   pyarrowpapandas.core.framer   r5   r   r!   r"   r(   r#   	functionsr$   r%   r&   __all__r.   r.   r.   r/   <module>   s@    0          
b