Okta and Workday scripts

2026-01-26 16:49:09 -05:00
parent 01c3f51dac
commit 0d7ff5b63e
25 changed files with 148270 additions and 0 deletions

@@ -0,0 +1,326 @@
# process.py
import os
import sys
import json
import re
import hashlib
from collections import Counter, defaultdict
import requests
import pandas as pd
import spacy
from openai import OpenAI
from tqdm import tqdm
# =========================
# Configuration
# =========================
#DEFAULT_LM_IP = "192.168.1.221" # default LM Studio host (without /v1)
DEFAULT_LM_IP = "10.81.209.99" # default LM Studio host (without /v1)
LLM_MODEL = "openai/gpt-oss-20b"
LLM_API_KEY = "not-needed" # LM Studio typically doesn't require an API key
INPUT_CSV = "test.csv"
OUTPUT_CSV = "test_with_names.csv"
EVENT_LOG = "event_log.txt"
# Columns to process
SOURCE_COL_1 = "Instance that Changed"
TARGET_COL_1 = "Applied to"
SOURCE_COL_2 = "Added"
TARGET_COL_2 = "Added Applied to"
ENTERED_COL = "Entered On"
ENTERED_MMDD_COL = "Entered On (MM/DD)"
# Values to ignore entirely (case-insensitive)
AUTO_STRINGS = {"automatic complete"}
def is_auto(val) -> bool:
return isinstance(val, str) and val.strip().lower() in AUTO_STRINGS
# Regex helpers
DELIM_SPLIT = re.compile(r"\s*[\/|\-–—]\s*")
KEEP_CHARS = re.compile(r"[^A-Za-zÀ-ÖØ-öø-ÿ' .\-]")
def clean_person(text: str) -> str:
"""Clean extracted name by removing job codes/fragments after dashes/slashes; keep name-ish chars."""
if not text:
return ""
first = DELIM_SPLIT.split(text, maxsplit=1)[0]
first = KEEP_CHARS.sub("", first).strip()
return re.sub(r"\s{2,}", " ", first)
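# Example (hypothetical input, not taken from the data): clean_person("Jane Doe - HR Partner / 10023")
# splits on the first dash/slash, drops the trailing job-code fragment, and returns "Jane Doe".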
# =========================
# LM Studio reachability
# =========================
def check_lmstudio(ip: str) -> str:
"""
Ensure LM Studio endpoint is reachable; if not, prompt for IP until it is.
Returns the validated base URL like "http://<ip>:1234/v1".
"""
def _ok(url: str) -> bool:
try:
r = requests.get(url.rstrip("/") + "/models", timeout=5)
return r.status_code == 200
except Exception:
return False
base_url = f"http://{ip}:1234/v1"
if _ok(base_url):
print(f"✅ LM Studio reachable at {base_url}")
return base_url
print(f"❌ Could not reach LM Studio at {base_url}")
while True:
new_ip = input("Enter LM Studio IP address (e.g. 192.168.1.221): ").strip()
if not new_ip:
print("Aborted: No IP provided.")
sys.exit(1)
base_url = f"http://{new_ip}:1234/v1"
print(f"🔍 Retesting {base_url}...")
if _ok(base_url):
print(f"✅ LM Studio reachable at {base_url}")
return base_url
else:
print("❌ Still unreachable. Try again or Ctrl+C to exit.")
# Perform reachability check BEFORE any processing
LLM_BASE_URL = check_lmstudio(DEFAULT_LM_IP)
client = OpenAI(base_url=LLM_BASE_URL, api_key=LLM_API_KEY)
# =========================
# spaCy model (Transformer)
# =========================
print("🔍 Loading spaCy transformer model: en_core_web_trf")
nlp = spacy.load(
"en_core_web_trf",
exclude=["parser", "tagger", "attribute_ruler", "lemmatizer", "morphologizer"],
)
print("✅ spaCy model loaded successfully.")
def extract_names(text: str) -> str:
"""Extract distinct PERSON names using spaCy Transformer model."""
if not isinstance(text, str) or not text.strip():
return ""
doc = nlp(text)
names, seen = [], set()
for ent in doc.ents:
if ent.label_ == "PERSON":
cleaned = clean_person(ent.text)
key = cleaned.lower()
if cleaned and key not in seen:
seen.add(key)
names.append(cleaned)
return ", ".join(names)
def insert_after(df: pd.DataFrame, after_col: str, new_col: str, values: pd.Series) -> None:
"""Insert new_col immediately after after_col (drop existing if present)."""
if new_col in df.columns:
df.drop(columns=[new_col], inplace=True)
idx = df.columns.get_loc(after_col) + 1
df.insert(idx, new_col, values)
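# Example: insert_after(df, SOURCE_COL_1, TARGET_COL_1, names) places the new "Applied to"
# column immediately to the right of "Instance that Changed" in the output CSV.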
def dataframe_to_compact_event(df: pd.DataFrame) -> str:
"""Compact JSON payload for a grouped event (keeps unique values per column)."""
def uniq(col):
return sorted([v for v in df[col].dropna().unique().tolist()]) if col in df else []
payload = {
"applied_to": uniq(TARGET_COL_1),
"by_user": uniq("By User"),
"in_transaction": uniq("In Transaction"),
"entered_on": uniq(ENTERED_COL),
"dates_mmdd": uniq(ENTERED_MMDD_COL),
"instances": uniq(SOURCE_COL_1),
"added": uniq(SOURCE_COL_2),
"row_count": int(len(df)),
}
return json.dumps(payload, ensure_ascii=False, indent=2)
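# Illustrative payload shape (keys match the dict above; the values here are made up):
# {
#   "applied_to": ["Jane Doe"],
#   "by_user": ["jsmith"],
#   "in_transaction": ["Assign Roles"],
#   "entered_on": ["2025-06-03 09:14:22 AM"],
#   "dates_mmdd": ["06/03"],
#   "instances": ["Security Partner"],
#   "added": ["Security Partner"],
#   "row_count": 2
# }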
# =========================
# Main flow
# =========================
# If processed CSV already exists, skip straight to summarization
if os.path.exists(OUTPUT_CSV):
print(f"⚡ Skipping CSV processing — {OUTPUT_CSV} already exists.")
df = pd.read_csv(OUTPUT_CSV)
# Ensure MM/DD exists (for old CSVs)
if ENTERED_MMDD_COL not in df.columns and ENTERED_COL in df.columns:
ts = pd.to_datetime(df[ENTERED_COL], errors="coerce")
df[ENTERED_MMDD_COL] = ts.dt.strftime("%m/%d").fillna("")
else:
print("⚙️ Processing CSV to extract names and generate output...")
# Load CSV
df = pd.read_csv(INPUT_CSV)
# Derive Entered On (MM/DD)
if ENTERED_COL in df.columns:
try:
ts = pd.to_datetime(df[ENTERED_COL], format="mixed", errors="coerce")
except TypeError:
ts = pd.to_datetime(df[ENTERED_COL], errors="coerce")
df[ENTERED_MMDD_COL] = ts.dt.strftime("%m/%d").fillna("")
else:
df[ENTERED_MMDD_COL] = ""
# Live progress counters for names across both columns
name_counter = Counter()
def _process_series_with_progress(series: pd.Series, desc: str) -> pd.Series:
"""Iterate with progress, update name_counter, and return extracted names Series."""
values = series.fillna("").astype(str).tolist()
out = []
total = len(values)
if total == 0:
return pd.Series([], dtype=object)
step = max(10, total // 20) # update ~every 5% (at least every 10 rows)
pbar = tqdm(values, desc=f"NER: {desc}", leave=True)
for i, text in enumerate(pbar, start=1):
names = extract_names(text)
# Update running totals (ignore "Automatic Complete")
for n in [x.strip() for x in names.split(",") if x.strip()]:
if n.lower() not in AUTO_STRINGS:
name_counter[n] += 1
out.append(names)
# Periodic status refresh
if i % step == 0 or i == total:
top = ", ".join(f"{n}:{c}" for n, c in name_counter.most_common(3))
pbar.set_postfix_str(f"unique={len(name_counter)} top=[{top}]")
return pd.Series(out, index=series.index, dtype=object)
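    # Example: _process_series_with_progress(df[SOURCE_COL_1], SOURCE_COL_1) returns a Series of
    # comma-separated PERSON names aligned to df's index, while the tqdm postfix shows the running
    # unique-name count and the three most common names seen so far.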
# 1) Extract from "Instance that Changed" -> "Applied to"
if SOURCE_COL_1 in df.columns:
applied_series = _process_series_with_progress(df[SOURCE_COL_1], SOURCE_COL_1)
insert_after(df, SOURCE_COL_1, TARGET_COL_1, applied_series)
else:
df[TARGET_COL_1] = ""
    # 1a) Simplified quick-fill:
    #     If "Applied to" has a value and "Added Applied to" is still empty, copy it over
if SOURCE_COL_2 in df.columns:
if TARGET_COL_2 not in df.columns:
df[TARGET_COL_2] = ""
for i, row in df.iterrows():
name = str(row.get(TARGET_COL_1, "")).strip()
aat = str(row.get(TARGET_COL_2, "")).strip()
if name and not aat:
df.at[i, TARGET_COL_2] = name
else:
df[TARGET_COL_2] = ""
# 2) Extract from "Added" -> "Added Applied to" (skip rows with value already set OR empty Added)
if SOURCE_COL_2 in df.columns:
mask_need = (df[TARGET_COL_2].fillna("").str.strip() == "") & (df[SOURCE_COL_2].fillna("").str.strip() != "")
idxs = df.index[mask_need].tolist()
if idxs:
values = df.loc[idxs, SOURCE_COL_2]
pbar = tqdm(values.tolist(), desc=f"NER: {SOURCE_COL_2} (remaining)", leave=True)
extracted = []
for text in pbar:
names = extract_names(text)
# update counter (ignore "Automatic Complete")
for n in [x.strip() for x in names.split(",") if x.strip()]:
if n.lower() not in AUTO_STRINGS:
name_counter[n] += 1
extracted.append(names)
df.loc[idxs, TARGET_COL_2] = extracted
# --- Remove any rows that are purely "Automatic Complete" in key fields ---
for col in [SOURCE_COL_1, SOURCE_COL_2, "In Transaction"]:
if col in df.columns:
df = df[~df[col].apply(is_auto)]
# --- Keep only selected columns (incl. MM/DD) ---
keep_cols = [
SOURCE_COL_1,
TARGET_COL_1,
"In Transaction",
SOURCE_COL_2,
TARGET_COL_2,
"By User",
ENTERED_COL,
ENTERED_MMDD_COL,
]
df = df[[c for c in keep_cols if c in df.columns]]
# --- Filter rows: keep where Applied to == Added Applied to (case-insensitive) ---
if TARGET_COL_1 in df.columns and TARGET_COL_2 in df.columns:
df = df[
df[TARGET_COL_1].fillna("").str.strip().str.lower()
== df[TARGET_COL_2].fillna("").str.strip().str.lower()
]
# --- Drop duplicates & save overall result ---
df = df.drop_duplicates().reset_index(drop=True)
df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Saved {len(df)} unique matching rows to {OUTPUT_CSV}")
# =========================
# LM Studio event summary generation (group by By User, then date asc)
# =========================
if not df.empty:
grouped = df.groupby([TARGET_COL_1, "By User", ENTERED_COL], dropna=False)
summaries = [] # list of tuples (by_user, mmdd, sentence)
for keys, gdf in grouped:
applied_to, by_user, entered_on = keys
if not applied_to or str(applied_to).strip() == "":
continue
mmdd_vals = gdf[ENTERED_MMDD_COL].dropna().astype(str)
mmdd = next((v for v in mmdd_vals if v.strip()), "")
payload = dataframe_to_compact_event(gdf)
prompt = (
"You are a compliance and information security analyst. "
"Given the following grouped audit data, produce ONE clear and concise sentence summarizing the event. "
"Include: (1) who performed the action (By User, include name and ID if available), "
"(2) who the change applied to (Applied to), "
"(3) the full list of role names that were assigned or added (from 'Instance that Changed' and 'Added'), "
"and (4) the date of the event. "
"Always mention the specific role titles exactly as shown in the data. "
"If multiple roles were assigned, list them all in a natural phrase like "
"'assigned the A, B, and C roles'. "
"Do not include raw JSON, extra commentary, or line breaks. Return only one sentence.\n\n"
f"Audit Data (JSON):\n{payload}"
)
try:
resp = client.chat.completions.create(
model=LLM_MODEL,
messages=[
{"role": "system", "content": "You write terse, clear compliance summaries."},
{"role": "user", "content": prompt},
],
temperature=0.2,
)
one_liner = (resp.choices[0].message.content or "").strip()
except Exception as e:
one_liner = f"[LLM ERROR] {e}"
summaries.append((by_user or "Unknown User", mmdd, one_liner))
# Group by By User, sort each user's entries by mm/dd asc, write file (OVERWRITE)
grouped_summaries: dict[str, list[tuple[str, str]]] = defaultdict(list)
for by_user, mmdd, line in summaries:
grouped_summaries[by_user].append((mmdd, line))
for user in grouped_summaries:
grouped_summaries[user].sort(key=lambda x: x[0] or "")
with open(EVENT_LOG, "w", encoding="utf-8") as f:
for user in sorted(grouped_summaries.keys()):
f.write(f"=== {user} ===\n")
for mmdd, line in grouped_summaries[user]:
prefix = f"{mmdd} - " if mmdd else ""
f.write(f"{prefix}{line}\n")
f.write("\n")
total_events = sum(len(v) for v in grouped_summaries.values())
print(f"📝 Overwrote {EVENT_LOG} with {total_events} grouped event summaries")
else:
print(" No matching rows found; nothing to summarize.")