Okta and Workday scripts

2026-01-26 16:49:09 -05:00
parent 01c3f51dac
commit 0d7ff5b63e
25 changed files with 148270 additions and 0 deletions

@@ -0,0 +1,326 @@
# process.py
import os
import sys
import json
import re
import hashlib
from collections import Counter, defaultdict
import requests
import pandas as pd
import spacy
from openai import OpenAI
from tqdm import tqdm
# =========================
# Configuration
# =========================
#DEFAULT_LM_IP = "192.168.1.221" # default LM Studio host (without /v1)
DEFAULT_LM_IP = "10.81.209.99" # default LM Studio host (without /v1)
LLM_MODEL = "openai/gpt-oss-20b"
LLM_API_KEY = "not-needed" # LM Studio typically doesn't require an API key
INPUT_CSV = "test.csv"
OUTPUT_CSV = "test_with_names.csv"
EVENT_LOG = "event_log.txt"
# Columns to process
SOURCE_COL_1 = "Instance that Changed"
TARGET_COL_1 = "Applied to"
SOURCE_COL_2 = "Added"
TARGET_COL_2 = "Added Applied to"
ENTERED_COL = "Entered On"
ENTERED_MMDD_COL = "Entered On (MM/DD)"
# Values to ignore entirely (case-insensitive)
AUTO_STRINGS = {"automatic complete"}
def is_auto(val) -> bool:
return isinstance(val, str) and val.strip().lower() in AUTO_STRINGS
# Regex helpers
DELIM_SPLIT = re.compile(r"\s*[\/|\-–—]\s*")
KEEP_CHARS = re.compile(r"[^A-Za-zÀ-ÖØ-öø-ÿ' .\-]")
def clean_person(text: str) -> str:
"""Clean extracted name by removing job codes/fragments after dashes/slashes; keep name-ish chars."""
if not text:
return ""
first = DELIM_SPLIT.split(text, maxsplit=1)[0]
first = KEEP_CHARS.sub("", first).strip()
return re.sub(r"\s{2,}", " ", first)
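# Example (hypothetical input, not taken from the data): clean_person("Jane Doe - HR Partner / 10023")
# splits on the first dash/slash, drops the trailing job-code fragment, and returns "Jane Doe".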
# =========================
# LM Studio reachability
# =========================
def check_lmstudio(ip: str) -> str:
"""
Ensure LM Studio endpoint is reachable; if not, prompt for IP until it is.
Returns the validated base URL like "http://<ip>:1234/v1".
"""
def _ok(url: str) -> bool:
try:
r = requests.get(url.rstrip("/") + "/models", timeout=5)
return r.status_code == 200
except Exception:
return False
base_url = f"http://{ip}:1234/v1"
if _ok(base_url):
print(f"✅ LM Studio reachable at {base_url}")
return base_url
print(f"❌ Could not reach LM Studio at {base_url}")
while True:
new_ip = input("Enter LM Studio IP address (e.g. 192.168.1.221): ").strip()
if not new_ip:
print("Aborted: No IP provided.")
sys.exit(1)
base_url = f"http://{new_ip}:1234/v1"
print(f"🔍 Retesting {base_url}...")
if _ok(base_url):
print(f"✅ LM Studio reachable at {base_url}")
return base_url
else:
print("❌ Still unreachable. Try again or Ctrl+C to exit.")
# Perform reachability check BEFORE any processing
LLM_BASE_URL = check_lmstudio(DEFAULT_LM_IP)
client = OpenAI(base_url=LLM_BASE_URL, api_key=LLM_API_KEY)
# =========================
# spaCy model (Transformer)
# =========================
print("🔍 Loading spaCy transformer model: en_core_web_trf")
nlp = spacy.load(
"en_core_web_trf",
exclude=["parser", "tagger", "attribute_ruler", "lemmatizer", "morphologizer"],
)
print("✅ spaCy model loaded successfully.")
def extract_names(text: str) -> str:
"""Extract distinct PERSON names using spaCy Transformer model."""
if not isinstance(text, str) or not text.strip():
return ""
doc = nlp(text)
names, seen = [], set()
for ent in doc.ents:
if ent.label_ == "PERSON":
cleaned = clean_person(ent.text)
key = cleaned.lower()
if cleaned and key not in seen:
seen.add(key)
names.append(cleaned)
return ", ".join(names)
def insert_after(df: pd.DataFrame, after_col: str, new_col: str, values: pd.Series) -> None:
"""Insert new_col immediately after after_col (drop existing if present)."""
if new_col in df.columns:
df.drop(columns=[new_col], inplace=True)
idx = df.columns.get_loc(after_col) + 1
df.insert(idx, new_col, values)
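# Example: insert_after(df, SOURCE_COL_1, TARGET_COL_1, names) places the new "Applied to"
# column immediately to the right of "Instance that Changed" in the output CSV.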
def dataframe_to_compact_event(df: pd.DataFrame) -> str:
"""Compact JSON payload for a grouped event (keeps unique values per column)."""
def uniq(col):
return sorted([v for v in df[col].dropna().unique().tolist()]) if col in df else []
payload = {
"applied_to": uniq(TARGET_COL_1),
"by_user": uniq("By User"),
"in_transaction": uniq("In Transaction"),
"entered_on": uniq(ENTERED_COL),
"dates_mmdd": uniq(ENTERED_MMDD_COL),
"instances": uniq(SOURCE_COL_1),
"added": uniq(SOURCE_COL_2),
"row_count": int(len(df)),
}
return json.dumps(payload, ensure_ascii=False, indent=2)
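# Illustrative payload shape (keys match the dict above; the values here are made up):
# {
#   "applied_to": ["Jane Doe"],
#   "by_user": ["jsmith"],
#   "in_transaction": ["Assign Roles"],
#   "entered_on": ["2025-06-03 09:14:22 AM"],
#   "dates_mmdd": ["06/03"],
#   "instances": ["Security Partner"],
#   "added": ["Security Partner"],
#   "row_count": 2
# }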
# =========================
# Main flow
# =========================
# If processed CSV already exists, skip straight to summarization
if os.path.exists(OUTPUT_CSV):
print(f"⚡ Skipping CSV processing — {OUTPUT_CSV} already exists.")
df = pd.read_csv(OUTPUT_CSV)
# Ensure MM/DD exists (for old CSVs)
if ENTERED_MMDD_COL not in df.columns and ENTERED_COL in df.columns:
ts = pd.to_datetime(df[ENTERED_COL], errors="coerce")
df[ENTERED_MMDD_COL] = ts.dt.strftime("%m/%d").fillna("")
else:
print("⚙️ Processing CSV to extract names and generate output...")
# Load CSV
df = pd.read_csv(INPUT_CSV)
# Derive Entered On (MM/DD)
if ENTERED_COL in df.columns:
try:
ts = pd.to_datetime(df[ENTERED_COL], format="mixed", errors="coerce")
except TypeError:
ts = pd.to_datetime(df[ENTERED_COL], errors="coerce")
df[ENTERED_MMDD_COL] = ts.dt.strftime("%m/%d").fillna("")
else:
df[ENTERED_MMDD_COL] = ""
# Live progress counters for names across both columns
name_counter = Counter()
def _process_series_with_progress(series: pd.Series, desc: str) -> pd.Series:
"""Iterate with progress, update name_counter, and return extracted names Series."""
values = series.fillna("").astype(str).tolist()
out = []
total = len(values)
if total == 0:
return pd.Series([], dtype=object)
step = max(10, total // 20) # update ~every 5% (at least every 10 rows)
pbar = tqdm(values, desc=f"NER: {desc}", leave=True)
for i, text in enumerate(pbar, start=1):
names = extract_names(text)
# Update running totals (ignore "Automatic Complete")
for n in [x.strip() for x in names.split(",") if x.strip()]:
if n.lower() not in AUTO_STRINGS:
name_counter[n] += 1
out.append(names)
# Periodic status refresh
if i % step == 0 or i == total:
top = ", ".join(f"{n}:{c}" for n, c in name_counter.most_common(3))
pbar.set_postfix_str(f"unique={len(name_counter)} top=[{top}]")
return pd.Series(out, index=series.index, dtype=object)
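    # Example: _process_series_with_progress(df[SOURCE_COL_1], SOURCE_COL_1) returns a Series of
    # comma-separated PERSON names aligned to df's index, while the tqdm postfix shows the running
    # unique-name count and the three most common names seen so far.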
# 1) Extract from "Instance that Changed" -> "Applied to"
if SOURCE_COL_1 in df.columns:
applied_series = _process_series_with_progress(df[SOURCE_COL_1], SOURCE_COL_1)
insert_after(df, SOURCE_COL_1, TARGET_COL_1, applied_series)
else:
df[TARGET_COL_1] = ""
    # 1a) Simplified quick-fill:
    #     If "Applied to" has a value and "Added Applied to" is still empty, copy it over
if SOURCE_COL_2 in df.columns:
if TARGET_COL_2 not in df.columns:
df[TARGET_COL_2] = ""
for i, row in df.iterrows():
name = str(row.get(TARGET_COL_1, "")).strip()
aat = str(row.get(TARGET_COL_2, "")).strip()
if name and not aat:
df.at[i, TARGET_COL_2] = name
else:
df[TARGET_COL_2] = ""
# 2) Extract from "Added" -> "Added Applied to" (skip rows with value already set OR empty Added)
if SOURCE_COL_2 in df.columns:
mask_need = (df[TARGET_COL_2].fillna("").str.strip() == "") & (df[SOURCE_COL_2].fillna("").str.strip() != "")
idxs = df.index[mask_need].tolist()
if idxs:
values = df.loc[idxs, SOURCE_COL_2]
pbar = tqdm(values.tolist(), desc=f"NER: {SOURCE_COL_2} (remaining)", leave=True)
extracted = []
for text in pbar:
names = extract_names(text)
# update counter (ignore "Automatic Complete")
for n in [x.strip() for x in names.split(",") if x.strip()]:
if n.lower() not in AUTO_STRINGS:
name_counter[n] += 1
extracted.append(names)
df.loc[idxs, TARGET_COL_2] = extracted
# --- Remove any rows that are purely "Automatic Complete" in key fields ---
for col in [SOURCE_COL_1, SOURCE_COL_2, "In Transaction"]:
if col in df.columns:
df = df[~df[col].apply(is_auto)]
# --- Keep only selected columns (incl. MM/DD) ---
keep_cols = [
SOURCE_COL_1,
TARGET_COL_1,
"In Transaction",
SOURCE_COL_2,
TARGET_COL_2,
"By User",
ENTERED_COL,
ENTERED_MMDD_COL,
]
df = df[[c for c in keep_cols if c in df.columns]]
# --- Filter rows: keep where Applied to == Added Applied to (case-insensitive) ---
if TARGET_COL_1 in df.columns and TARGET_COL_2 in df.columns:
df = df[
df[TARGET_COL_1].fillna("").str.strip().str.lower()
== df[TARGET_COL_2].fillna("").str.strip().str.lower()
]
# --- Drop duplicates & save overall result ---
df = df.drop_duplicates().reset_index(drop=True)
df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Saved {len(df)} unique matching rows to {OUTPUT_CSV}")
# =========================
# LM Studio event summary generation (group by By User, then date asc)
# =========================
if not df.empty:
grouped = df.groupby([TARGET_COL_1, "By User", ENTERED_COL], dropna=False)
summaries = [] # list of tuples (by_user, mmdd, sentence)
for keys, gdf in grouped:
applied_to, by_user, entered_on = keys
if not applied_to or str(applied_to).strip() == "":
continue
mmdd_vals = gdf[ENTERED_MMDD_COL].dropna().astype(str)
mmdd = next((v for v in mmdd_vals if v.strip()), "")
payload = dataframe_to_compact_event(gdf)
prompt = (
"You are a compliance and information security analyst. "
"Given the following grouped audit data, produce ONE clear and concise sentence summarizing the event. "
"Include: (1) who performed the action (By User, include name and ID if available), "
"(2) who the change applied to (Applied to), "
"(3) the full list of role names that were assigned or added (from 'Instance that Changed' and 'Added'), "
"and (4) the date of the event. "
"Always mention the specific role titles exactly as shown in the data. "
"If multiple roles were assigned, list them all in a natural phrase like "
"'assigned the A, B, and C roles'. "
"Do not include raw JSON, extra commentary, or line breaks. Return only one sentence.\n\n"
f"Audit Data (JSON):\n{payload}"
)
try:
resp = client.chat.completions.create(
model=LLM_MODEL,
messages=[
{"role": "system", "content": "You write terse, clear compliance summaries."},
{"role": "user", "content": prompt},
],
temperature=0.2,
)
one_liner = (resp.choices[0].message.content or "").strip()
except Exception as e:
one_liner = f"[LLM ERROR] {e}"
summaries.append((by_user or "Unknown User", mmdd, one_liner))
# Group by By User, sort each user's entries by mm/dd asc, write file (OVERWRITE)
grouped_summaries: dict[str, list[tuple[str, str]]] = defaultdict(list)
for by_user, mmdd, line in summaries:
grouped_summaries[by_user].append((mmdd, line))
for user in grouped_summaries:
grouped_summaries[user].sort(key=lambda x: x[0] or "")
with open(EVENT_LOG, "w", encoding="utf-8") as f:
for user in sorted(grouped_summaries.keys()):
f.write(f"=== {user} ===\n")
for mmdd, line in grouped_summaries[user]:
prefix = f"{mmdd} - " if mmdd else ""
f.write(f"{prefix}{line}\n")
f.write("\n")
total_events = sum(len(v) for v in grouped_summaries.values())
print(f"📝 Overwrote {EVENT_LOG} with {total_events} grouped event summaries")
else:
print(" No matching rows found; nothing to summarize.")