# process.py import os import sys import json import re import hashlib from collections import Counter, defaultdict import requests import pandas as pd import spacy from openai import OpenAI from tqdm import tqdm # ========================= # Configuration # ========================= #DEFAULT_LM_IP = "192.168.1.221" # default LM Studio host (without /v1) DEFAULT_LM_IP = "10.81.209.99" # default LM Studio host (without /v1) LLM_MODEL = "openai/gpt-oss-20b" LLM_API_KEY = "not-needed" # LM Studio typically doesn't require an API key INPUT_CSV = "test.csv" OUTPUT_CSV = "test_with_names.csv" EVENT_LOG = "event_log.txt" # Columns to process SOURCE_COL_1 = "Instance that Changed" TARGET_COL_1 = "Applied to" SOURCE_COL_2 = "Added" TARGET_COL_2 = "Added Applied to" ENTERED_COL = "Entered On" ENTERED_MMDD_COL = "Entered On (MM/DD)" # Values to ignore entirely (case-insensitive) AUTO_STRINGS = {"automatic complete"} def is_auto(val) -> bool: return isinstance(val, str) and val.strip().lower() in AUTO_STRINGS # Regex helpers DELIM_SPLIT = re.compile(r"\s*[\/|\-–—]\s*") KEEP_CHARS = re.compile(r"[^A-Za-zÀ-ÖØ-öø-ÿ' .\-]") def clean_person(text: str) -> str: """Clean extracted name by removing job codes/fragments after dashes/slashes; keep name-ish chars.""" if not text: return "" first = DELIM_SPLIT.split(text, maxsplit=1)[0] first = KEEP_CHARS.sub("", first).strip() return re.sub(r"\s{2,}", " ", first) # ========================= # LM Studio reachability # ========================= def check_lmstudio(ip: str) -> str: """ Ensure LM Studio endpoint is reachable; if not, prompt for IP until it is. Returns the validated base URL like "http://:1234/v1". """ def _ok(url: str) -> bool: try: r = requests.get(url.rstrip("/") + "/models", timeout=5) return r.status_code == 200 except Exception: return False base_url = f"http://{ip}:1234/v1" if _ok(base_url): print(f"✅ LM Studio reachable at {base_url}") return base_url print(f"❌ Could not reach LM Studio at {base_url}") while True: new_ip = input("Enter LM Studio IP address (e.g. 192.168.1.221): ").strip() if not new_ip: print("Aborted: No IP provided.") sys.exit(1) base_url = f"http://{new_ip}:1234/v1" print(f"🔍 Retesting {base_url}...") if _ok(base_url): print(f"✅ LM Studio reachable at {base_url}") return base_url else: print("❌ Still unreachable. Try again or Ctrl+C to exit.") # Perform reachability check BEFORE any processing LLM_BASE_URL = check_lmstudio(DEFAULT_LM_IP) client = OpenAI(base_url=LLM_BASE_URL, api_key=LLM_API_KEY) # ========================= # spaCy model (Transformer) # ========================= print("🔍 Loading spaCy transformer model: en_core_web_trf") nlp = spacy.load( "en_core_web_trf", exclude=["parser", "tagger", "attribute_ruler", "lemmatizer", "morphologizer"], ) print("✅ spaCy model loaded successfully.") def extract_names(text: str) -> str: """Extract distinct PERSON names using spaCy Transformer model.""" if not isinstance(text, str) or not text.strip(): return "" doc = nlp(text) names, seen = [], set() for ent in doc.ents: if ent.label_ == "PERSON": cleaned = clean_person(ent.text) key = cleaned.lower() if cleaned and key not in seen: seen.add(key) names.append(cleaned) return ", ".join(names) def insert_after(df: pd.DataFrame, after_col: str, new_col: str, values: pd.Series) -> None: """Insert new_col immediately after after_col (drop existing if present).""" if new_col in df.columns: df.drop(columns=[new_col], inplace=True) idx = df.columns.get_loc(after_col) + 1 df.insert(idx, new_col, values) def dataframe_to_compact_event(df: pd.DataFrame) -> str: """Compact JSON payload for a grouped event (keeps unique values per column).""" def uniq(col): return sorted([v for v in df[col].dropna().unique().tolist()]) if col in df else [] payload = { "applied_to": uniq(TARGET_COL_1), "by_user": uniq("By User"), "in_transaction": uniq("In Transaction"), "entered_on": uniq(ENTERED_COL), "dates_mmdd": uniq(ENTERED_MMDD_COL), "instances": uniq(SOURCE_COL_1), "added": uniq(SOURCE_COL_2), "row_count": int(len(df)), } return json.dumps(payload, ensure_ascii=False, indent=2) # ========================= # Main flow # ========================= # If processed CSV already exists, skip straight to summarization if os.path.exists(OUTPUT_CSV): print(f"⚡ Skipping CSV processing — {OUTPUT_CSV} already exists.") df = pd.read_csv(OUTPUT_CSV) # Ensure MM/DD exists (for old CSVs) if ENTERED_MMDD_COL not in df.columns and ENTERED_COL in df.columns: ts = pd.to_datetime(df[ENTERED_COL], errors="coerce") df[ENTERED_MMDD_COL] = ts.dt.strftime("%m/%d").fillna("") else: print("⚙️ Processing CSV to extract names and generate output...") # Load CSV df = pd.read_csv(INPUT_CSV) # Derive Entered On (MM/DD) if ENTERED_COL in df.columns: try: ts = pd.to_datetime(df[ENTERED_COL], format="mixed", errors="coerce") except TypeError: ts = pd.to_datetime(df[ENTERED_COL], errors="coerce") df[ENTERED_MMDD_COL] = ts.dt.strftime("%m/%d").fillna("") else: df[ENTERED_MMDD_COL] = "" # Live progress counters for names across both columns name_counter = Counter() def _process_series_with_progress(series: pd.Series, desc: str) -> pd.Series: """Iterate with progress, update name_counter, and return extracted names Series.""" values = series.fillna("").astype(str).tolist() out = [] total = len(values) if total == 0: return pd.Series([], dtype=object) step = max(10, total // 20) # update ~every 5% (at least every 10 rows) pbar = tqdm(values, desc=f"NER: {desc}", leave=True) for i, text in enumerate(pbar, start=1): names = extract_names(text) # Update running totals (ignore "Automatic Complete") for n in [x.strip() for x in names.split(",") if x.strip()]: if n.lower() not in AUTO_STRINGS: name_counter[n] += 1 out.append(names) # Periodic status refresh if i % step == 0 or i == total: top = ", ".join(f"{n}:{c}" for n, c in name_counter.most_common(3)) pbar.set_postfix_str(f"unique={len(name_counter)} top=[{top}]") return pd.Series(out, index=series.index, dtype=object) # 1) Extract from "Instance that Changed" -> "Applied to" if SOURCE_COL_1 in df.columns: applied_series = _process_series_with_progress(df[SOURCE_COL_1], SOURCE_COL_1) insert_after(df, SOURCE_COL_1, TARGET_COL_1, applied_series) else: df[TARGET_COL_1] = "" # 1a) Simplified quick-fill: # If "Applied to" has a value, always copy it to "Added Applied to" if SOURCE_COL_2 in df.columns: if TARGET_COL_2 not in df.columns: df[TARGET_COL_2] = "" for i, row in df.iterrows(): name = str(row.get(TARGET_COL_1, "")).strip() aat = str(row.get(TARGET_COL_2, "")).strip() if name and not aat: df.at[i, TARGET_COL_2] = name else: df[TARGET_COL_2] = "" # 2) Extract from "Added" -> "Added Applied to" (skip rows with value already set OR empty Added) if SOURCE_COL_2 in df.columns: mask_need = (df[TARGET_COL_2].fillna("").str.strip() == "") & (df[SOURCE_COL_2].fillna("").str.strip() != "") idxs = df.index[mask_need].tolist() if idxs: values = df.loc[idxs, SOURCE_COL_2] pbar = tqdm(values.tolist(), desc=f"NER: {SOURCE_COL_2} (remaining)", leave=True) extracted = [] for text in pbar: names = extract_names(text) # update counter (ignore "Automatic Complete") for n in [x.strip() for x in names.split(",") if x.strip()]: if n.lower() not in AUTO_STRINGS: name_counter[n] += 1 extracted.append(names) df.loc[idxs, TARGET_COL_2] = extracted # --- Remove any rows that are purely "Automatic Complete" in key fields --- for col in [SOURCE_COL_1, SOURCE_COL_2, "In Transaction"]: if col in df.columns: df = df[~df[col].apply(is_auto)] # --- Keep only selected columns (incl. MM/DD) --- keep_cols = [ SOURCE_COL_1, TARGET_COL_1, "In Transaction", SOURCE_COL_2, TARGET_COL_2, "By User", ENTERED_COL, ENTERED_MMDD_COL, ] df = df[[c for c in keep_cols if c in df.columns]] # --- Filter rows: keep where Applied to == Added Applied to (case-insensitive) --- if TARGET_COL_1 in df.columns and TARGET_COL_2 in df.columns: df = df[ df[TARGET_COL_1].fillna("").str.strip().str.lower() == df[TARGET_COL_2].fillna("").str.strip().str.lower() ] # --- Drop duplicates & save overall result --- df = df.drop_duplicates().reset_index(drop=True) df.to_csv(OUTPUT_CSV, index=False) print(f"✅ Saved {len(df)} unique matching rows to {OUTPUT_CSV}") # ========================= # LM Studio event summary generation (group by By User, then date asc) # ========================= if not df.empty: grouped = df.groupby([TARGET_COL_1, "By User", ENTERED_COL], dropna=False) summaries = [] # list of tuples (by_user, mmdd, sentence) for keys, gdf in grouped: applied_to, by_user, entered_on = keys if not applied_to or str(applied_to).strip() == "": continue mmdd_vals = gdf[ENTERED_MMDD_COL].dropna().astype(str) mmdd = next((v for v in mmdd_vals if v.strip()), "") payload = dataframe_to_compact_event(gdf) prompt = ( "You are a compliance and information security analyst. " "Given the following grouped audit data, produce ONE clear and concise sentence summarizing the event. " "Include: (1) who performed the action (By User, include name and ID if available), " "(2) who the change applied to (Applied to), " "(3) the full list of role names that were assigned or added (from 'Instance that Changed' and 'Added'), " "and (4) the date of the event. " "Always mention the specific role titles exactly as shown in the data. " "If multiple roles were assigned, list them all in a natural phrase like " "'assigned the A, B, and C roles'. " "Do not include raw JSON, extra commentary, or line breaks. Return only one sentence.\n\n" f"Audit Data (JSON):\n{payload}" ) try: resp = client.chat.completions.create( model=LLM_MODEL, messages=[ {"role": "system", "content": "You write terse, clear compliance summaries."}, {"role": "user", "content": prompt}, ], temperature=0.2, ) one_liner = (resp.choices[0].message.content or "").strip() except Exception as e: one_liner = f"[LLM ERROR] {e}" summaries.append((by_user or "Unknown User", mmdd, one_liner)) # Group by By User, sort each user's entries by mm/dd asc, write file (OVERWRITE) grouped_summaries: dict[str, list[tuple[str, str]]] = defaultdict(list) for by_user, mmdd, line in summaries: grouped_summaries[by_user].append((mmdd, line)) for user in grouped_summaries: grouped_summaries[user].sort(key=lambda x: x[0] or "") with open(EVENT_LOG, "w", encoding="utf-8") as f: for user in sorted(grouped_summaries.keys()): f.write(f"=== {user} ===\n") for mmdd, line in grouped_summaries[user]: prefix = f"{mmdd} - " if mmdd else "" f.write(f"{prefix}{line}\n") f.write("\n") total_events = sum(len(v) for v in grouped_summaries.values()) print(f"📝 Overwrote {EVENT_LOG} with {total_events} grouped event summaries") else: print("ℹ️ No matching rows found; nothing to summarize.")