Source code for pandas_diff.pandas_diff
"""Main module."""
from __future__ import annotations
import pandas as pd
from pandas_diff.pre_process import pre_process
from pandas_diff.process_results import format_results_create_delete, format_results_modify
[docs]
def get_diffs(
    before: pd.DataFrame,
    after: pd.DataFrame,
    keys: list[str] | str,
    ignore_columns: list[str] | None = None,
) -> pd.DataFrame:
    """Generate DataFrame with differences between two DataFrames.

    Rows are matched between ``before`` and ``after`` by the string form of
    their key-column values. Rows only present in ``after`` are reported as
    ``"create"``, rows only present in ``before`` as ``"delete"``, and for
    rows present in both, one entry is emitted per non-key column whose value
    changed. Two missing values (as judged by ``pd.isna``) are treated as
    equal. Only columns that exist in ``before`` are compared.

    The output order is deterministic: creations (in ``after`` row order),
    then deletions (in ``before`` row order), then modifications (in
    ``before`` row order, columns in ``before`` column order).

    Args:
        before: DataFrame representing the previous state.
        after: DataFrame representing the current state.
        keys: Column name(s) used to identify rows.
        ignore_columns: Columns to exclude from comparison.

    Returns:
        DataFrame with columns: operation, object_keys, object_values,
        object_json, attribute_changed, old_value, new_value.

    Raises:
        TypeError: If before or after are not DataFrames.
        ValueError: If keys is empty or not found in columns. This check is
            not performed in this function; it is presumably raised by
            ``pre_process``.
    """
    if not isinstance(before, pd.DataFrame):
        raise TypeError(f"before must be a pandas DataFrame, got {type(before).__name__}")
    if not isinstance(after, pd.DataFrame):
        raise TypeError(f"after must be a pandas DataFrame, got {type(after).__name__}")
    if ignore_columns is None:
        ignore_columns = []

    before_df, after_df, keys = pre_process(before, after, keys)

    # One label per row: the string form of that row's {key column: value} dict.
    before_labels = [str(record) for record in before_df[keys].to_dict(orient="records")]
    after_labels = [str(record) for record in after_df[keys].to_dict(orient="records")]
    before_label_set = set(before_labels)
    after_label_set = set(after_labels)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # does not depend on set iteration order (which varies between interpreter
    # runs for strings because of hash randomisation).
    # Added elements are in `after` but not in `before`.
    added_keys = [label for label in dict.fromkeys(after_labels) if label not in before_label_set]
    # Deleted elements are in `before` but not in `after`.
    deleted_keys = [label for label in dict.fromkeys(before_labels) if label not in after_label_set]
    common_keys = [label for label in dict.fromkeys(before_labels) if label in after_label_set]

    # Index the frames by label without first writing a helper column into
    # them: set_index returns a new frame, so the objects handed back by
    # pre_process (which may be the caller's own frames) are never mutated here.
    before_df = before_df.set_index(pd.Index(before_labels, name="___keys"))
    after_df = after_df.set_index(pd.Index(after_labels, name="___keys"))

    results = []
    for added_key in added_keys:
        results.append(format_results_create_delete(after_df.loc[added_key, :], "create", keys))
    for deleted_key in deleted_keys:
        results.append(format_results_create_delete(before_df.loc[deleted_key, :], "delete", keys))

    # The set of columns to compare is the same for every row, so filter out
    # key columns and ignored columns once instead of per (row, column) pair.
    key_columns = set(keys)
    compared_columns = [
        col
        for col in dict.fromkeys(before_df.columns)
        if col not in key_columns and col not in ignore_columns
    ]

    for common_key in common_keys:
        for col in compared_columns:
            old_value = before_df.loc[common_key, col]
            new_value = after_df.loc[common_key, col]
            # NaN != NaN is True, so a pair of missing values must be excluded
            # explicitly; the isna checks only run when the values differ.
            if old_value != new_value and not (pd.isna(old_value) and pd.isna(new_value)):
                results.append(
                    format_results_modify(
                        row=after_df.loc[common_key, :],
                        keys=keys,
                        attribute_changed=col,
                        old_value=old_value,
                        new_value=new_value,
                    )
                )

    return pd.DataFrame(results)