"""Streamlit app that compares two Excel sheets on user-selected key columns.

The page lets the user upload two workbooks, pick one sheet from each, list
the unique values of any column, and compare the sheets on a composite key
built from the selected columns.  Matched rows can be shown side by side
(optionally with per-cell difference highlighting); rows found in only one
file are listed separately, and everything can be downloaded as one Excel
workbook.  The sidebar shows the size of the on-disk Streamlit cache
directory and offers a button to delete it.
"""

import io
import os
import shutil

import pandas as pd
import streamlit as st


def get_dir_size(path):
    """Return the total size, in MB, of all regular files below *path*.

    A path that does not exist yields 0.0 because ``os.walk`` produces no
    entries for it.  Files that disappear or cannot be read while walking are
    skipped instead of aborting the whole page.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                if os.path.isfile(file_path):
                    total_size += os.path.getsize(file_path)
            except OSError:
                # The file vanished or is unreadable; leave it out of the sum.
                continue
    return total_size / (1024 * 1024)  # bytes -> MB


def detect_column_types(df, columns):
    """Map each requested column to the name of its most common Python type.

    Columns not present in *df* are ignored.  Missing values are excluded
    before counting; a column with no non-missing values is reported as
    ``"empty"`` rather than raising on an empty mode result.
    """
    detected = {}
    for col in columns:
        if col not in df:
            continue
        kinds = df[col].dropna().map(type)
        if kinds.empty:
            detected[col] = "empty"
            continue
        dominant = kinds.mode().iloc[0]
        # str(<class 'int'>) -> "<class 'int'>"; keep the text between quotes.
        detected[col] = str(dominant).split("'")[-2]
    return detected


def normalize_columns(df, columns):
    """Build one composite string key per row from *columns*.

    Each cell is normalised so that equal values compare equal across files:
    missing values become ``""``, whole-number floats lose the ``.0`` suffix,
    and everything else is stringified and stripped.  The cleaned cells of a
    row are joined with ``" | "``.  The result is a Series aligned to
    ``df.index``.
    """
    def clean(val):
        if pd.isna(val):
            return ""
        if isinstance(val, float) and val.is_integer():
            return str(int(val))
        return str(val).strip()

    # Series.map is used per column because DataFrame.applymap is deprecated.
    cleaned = [df[col].map(clean) for col in columns]
    keys = [" | ".join(cells) for cells in zip(*cleaned)]
    return pd.Series(keys, index=df.index, dtype=object)


def highlight_diff(row):
    """Return one CSS style per cell of a merged row.

    An ``F1_<name>`` cell is coloured red when it differs from the
    ``F2_<name>`` cell of the same row and green when both agree.  Two missing
    values count as equal.  Every other cell (the key column, all ``F2_``
    cells, and ``F1_`` cells without a counterpart) gets no style.
    """
    styles = []
    for col in row.index:
        if not col.startswith("F1_"):
            styles.append("")
            continue
        # Swap only the leading prefix; the column name itself may contain "F1_".
        match_col = "F2_" + col[len("F1_"):]
        if match_col not in row.index:
            styles.append("")
            continue
        left, right = row[col], row[match_col]
        left_missing, right_missing = pd.isna(left), pd.isna(right)
        if left_missing or right_missing:
            # NaN != NaN, so missing values need an explicit comparison.
            differs = not (left_missing and right_missing)
        else:
            differs = left != right
        styles.append("background-color: #ffcccc" if differs else "background-color: #ccffcc")
    return styles


st.set_page_config(page_title="Excel Comparator", layout="wide")
st.title("๐Ÿ” Excel Comparator โ€” Multi-Column Match + Unique Viewer")

# --- Sidebar: cache size and "clear cache" button ---------------------------
# NOTE(review): confirm that this directory is where the deployed Streamlit
# version actually persists cache data.
cache_path = os.path.expanduser("~/.cache/streamlit")

# Reserve the slot above the button first, but fill it only after the button
# has been handled, so the size shown reflects a deletion made in this run.
cache_size_slot = st.sidebar.empty()

if st.sidebar.button("๐Ÿงน Clear Streamlit Cache"):
    try:
        if os.path.isdir(cache_path):
            shutil.rmtree(cache_path)
        st.sidebar.success("โœ… Cache cleared successfully.")
    except OSError as e:
        st.sidebar.error(f"โš ๏ธ Error clearing cache: {e}")

cache_size_mb = get_dir_size(cache_path)
cache_size_slot.write(f"๐Ÿ—‚๏ธ Cache Size: {cache_size_mb:.2f} MB")

# --- Uploads ----------------------------------------------------------------
file1 = st.file_uploader("๐Ÿ“„ Upload First Excel File", type=["xlsx", "xls"])
file2 = st.file_uploader("๐Ÿ“„ Upload Second Excel File", type=["xlsx", "xls"])

if file1 and file2:
    xls1 = pd.ExcelFile(file1)
    xls2 = pd.ExcelFile(file2)

    sheet1 = st.selectbox("๐Ÿ“‘ Select Sheet from File 1", xls1.sheet_names)
    sheet2 = st.selectbox("๐Ÿ“‘ Select Sheet from File 2", xls2.sheet_names)

    df1 = pd.read_excel(xls1, sheet_name=sheet1)
    df2 = pd.read_excel(xls2, sheet_name=sheet2)

    st.write("### ๐Ÿ” Preview โ€” File 1")
    st.dataframe(df1.head())
    st.write("### ๐Ÿ” Preview โ€” File 2")
    st.dataframe(df2.head())

    # --- Unique values of a single column -----------------------------------
    st.subheader("๐Ÿ“Œ Unique Values Finder")
    with st.expander("๐Ÿ”Ž Find Unique Values from Any Column"):
        file_choice = st.radio("Choose file", ["File 1", "File 2"], horizontal=True)

        if file_choice == "File 1":
            file_label, source_df, widget_key, file_tag = "File 1", df1, "u1", "file1"
        else:
            file_label, source_df, widget_key, file_tag = "File 2", df2, "u2", "file2"

        uniq_col = st.selectbox(f"Select column from {file_label}", source_df.columns,
                                key=widget_key)
        # Compare against None so a column whose header is 0 or "" still works.
        if uniq_col is not None:
            uniq_vals = sorted(source_df[uniq_col].dropna().astype(str).unique())
            df_unique = pd.DataFrame(uniq_vals, columns=[f"Unique in {uniq_col}"])
            st.write(f"Found **{len(df_unique)}** unique values in **{uniq_col}** ({file_label})")
            st.dataframe(df_unique)
            st.download_button("โฌ‡๏ธ Download CSV", df_unique.to_csv(index=False),
                               file_name=f"unique_{uniq_col}_{file_tag}.csv", mime="text/csv")

    # --- Key-column selection -----------------------------------------------
    st.subheader("๐Ÿ”ง Select Columns for Matching")
    cols1 = st.multiselect("Matching columns from File 1", df1.columns, key="cols1")
    cols2 = st.multiselect("Matching columns from File 2", df2.columns, key="cols2")

    show_merge = st.checkbox("๐Ÿ”€ Show Merged Matched Rows (Side-by-Side)", value=True)
    highlight_diffs = st.checkbox("๐ŸŽจ Highlight Differences in Merged View", value=True)

    if cols1 and cols2:
        col_types1 = detect_column_types(df1, cols1)
        col_types2 = detect_column_types(df2, cols2)
        with st.expander("๐Ÿงช Detected Column Types"):
            st.write("**File 1 Column Types:**", col_types1)
            st.write("**File 2 Column Types:**", col_types2)

    # --- Comparison ---------------------------------------------------------
    # NOTE(review): the results below are rendered only in the run in which
    # "Compare Now" was clicked; a later rerun (e.g. triggered by a download
    # button) presumably hides them again — confirm whether they should be
    # kept in st.session_state.
    if len(cols1) != len(cols2):
        st.warning("โš ๏ธ Please select the same number of columns from both files.")
    elif cols1 and cols2 and st.button("๐Ÿ” Compare Now"):
        df1["__key__"] = normalize_columns(df1, cols1)
        df2["__key__"] = normalize_columns(df2, cols2)

        keys1 = set(df1["__key__"])
        keys2 = set(df2["__key__"])
        match_keys = keys1 & keys2
        only1_keys = keys1 - keys2
        only2_keys = keys2 - keys1

        df_match1 = df1[df1["__key__"].isin(match_keys)].copy()
        df_match2 = df2[df2["__key__"].isin(match_keys)].copy()
        df_only1 = df1[df1["__key__"].isin(only1_keys)].drop(columns="__key__")
        df_only2 = df2[df2["__key__"].isin(only2_keys)].drop(columns="__key__")

        # The match figure counts distinct keys, not rows of either file.
        st.success(f"โœ… Matches: {len(match_keys)} rows")
        st.info(f"๐Ÿ“ Only in File 1: {len(df_only1)} rows")
        st.info(f"๐Ÿ“ Only in File 2: {len(df_only2)} rows")

        merged = pd.DataFrame()

        if show_merge:
            left = df_match1.set_index("__key__").add_prefix("F1_")
            right = df_match2.set_index("__key__").add_prefix("F2_")

            # Joining on the key index pairs every File 1 row with every
            # File 2 row that shares its key.
            merged = pd.merge(left, right, left_index=True, right_index=True,
                              how="inner").reset_index()
            merged = merged.rename(columns={"__key__": "Match_Key"})

            with st.expander("๐ŸŽฏ Merged Matched Rows"):
                st.dataframe(merged)

            if highlight_diffs:
                st.subheader("๐ŸŽจ Highlighted Differences")
                st.dataframe(merged.style.apply(highlight_diff, axis=1))

        with st.expander("โŒ Only in File 1"):
            st.dataframe(df_only1)

        with st.expander("โŒ Only in File 2"):
            st.dataframe(df_only2)

        def create_excel():
            """Write the comparison results to an in-memory .xlsx workbook.

            The side-by-side sheet is included only when the merged view is
            enabled.  Returns the buffer rewound to position 0.
            """
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                if show_merge:
                    merged.to_excel(writer, sheet_name="Matched_SideBySide", index=False)
                df_only1.to_excel(writer, sheet_name="Only_in_File1", index=False)
                df_only2.to_excel(writer, sheet_name="Only_in_File2", index=False)
            buffer.seek(0)
            return buffer

        st.download_button("โฌ‡๏ธ Download All Results (Excel)",
                           create_excel(),
                           file_name="comparison_results.xlsx",
                           mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")