Okta and Workday scripts
workday_monthly_admin_activities_audit/bkup-process.py (new file, 326 lines added)
@@ -0,0 +1,326 @@
# process.py
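# Overview: reads a Workday admin-activity audit export (INPUT_CSV), uses spaCy NER
# to pull PERSON names out of the "Instance that Changed" and "Added" columns, keeps
# only rows where the two extracted names agree, saves the result to OUTPUT_CSV, and
# then asks an LM Studio-hosted model to write a one-sentence compliance summary per
# (Applied to, By User, Entered On) group into EVENT_LOG.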
import os
import sys
import json
import re
import hashlib
from collections import Counter, defaultdict

import requests
import pandas as pd
import spacy
from openai import OpenAI
from tqdm import tqdm

# =========================
# Configuration
# =========================
#DEFAULT_LM_IP = "192.168.1.221"  # default LM Studio host (without /v1)
DEFAULT_LM_IP = "10.81.209.99"  # default LM Studio host (without /v1)
LLM_MODEL = "openai/gpt-oss-20b"
LLM_API_KEY = "not-needed"  # LM Studio typically doesn't require an API key

INPUT_CSV = "test.csv"
OUTPUT_CSV = "test_with_names.csv"
EVENT_LOG = "event_log.txt"

# Columns to process
SOURCE_COL_1 = "Instance that Changed"
TARGET_COL_1 = "Applied to"

SOURCE_COL_2 = "Added"
TARGET_COL_2 = "Added Applied to"

ENTERED_COL = "Entered On"
ENTERED_MMDD_COL = "Entered On (MM/DD)"

# Values to ignore entirely (case-insensitive)
AUTO_STRINGS = {"automatic complete"}

def is_auto(val) -> bool:
    return isinstance(val, str) and val.strip().lower() in AUTO_STRINGS

# Regex helpers
DELIM_SPLIT = re.compile(r"\s*[\/|\-–—]\s*")
KEEP_CHARS = re.compile(r"[^A-Za-zÀ-ÖØ-öø-ÿ' .\-]")
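# DELIM_SPLIT matches "/", "|", "-", "–", or "—" with optional surrounding whitespace;
# KEEP_CHARS matches any character that is not a letter (incl. Latin-1 accents),
# apostrophe, space, period, or hyphen.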

def clean_person(text: str) -> str:
    """Clean extracted name by removing job codes/fragments after dashes/slashes; keep name-ish chars."""
    if not text:
        return ""
    first = DELIM_SPLIT.split(text, maxsplit=1)[0]
    first = KEEP_CHARS.sub("", first).strip()
    return re.sub(r"\s{2,}", " ", first)
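# Illustrative example (hypothetical input): clean_person("Jane Doe - HR Administrator")
# returns "Jane Doe".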

# =========================
# LM Studio reachability
# =========================
def check_lmstudio(ip: str) -> str:
    """
    Ensure LM Studio endpoint is reachable; if not, prompt for IP until it is.
    Returns the validated base URL like "http://<ip>:1234/v1".
    """
    def _ok(url: str) -> bool:
        try:
            r = requests.get(url.rstrip("/") + "/models", timeout=5)
            return r.status_code == 200
        except Exception:
            return False

    base_url = f"http://{ip}:1234/v1"
    if _ok(base_url):
        print(f"✅ LM Studio reachable at {base_url}")
        return base_url

    print(f"❌ Could not reach LM Studio at {base_url}")
    while True:
        new_ip = input("Enter LM Studio IP address (e.g. 192.168.1.221): ").strip()
        if not new_ip:
            print("Aborted: No IP provided.")
            sys.exit(1)
        base_url = f"http://{new_ip}:1234/v1"
        print(f"🔍 Retesting {base_url}...")
        if _ok(base_url):
            print(f"✅ LM Studio reachable at {base_url}")
            return base_url
        else:
            print("❌ Still unreachable. Try again or Ctrl+C to exit.")

# Perform reachability check BEFORE any processing
LLM_BASE_URL = check_lmstudio(DEFAULT_LM_IP)
client = OpenAI(base_url=LLM_BASE_URL, api_key=LLM_API_KEY)
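# Note: the OpenAI SDK here is only a client for LM Studio's OpenAI-compatible
# endpoint, so the API key is a placeholder rather than a real credential.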

# =========================
# spaCy model (Transformer)
# =========================
print("🔍 Loading spaCy transformer model: en_core_web_trf")
nlp = spacy.load(
    "en_core_web_trf",
    exclude=["parser", "tagger", "attribute_ruler", "lemmatizer", "morphologizer"],
)
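# Only the transformer and NER components are needed for PERSON extraction;
# the other pipeline components are excluded to cut load time and memory.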
print("✅ spaCy model loaded successfully.")

def extract_names(text: str) -> str:
    """Extract distinct PERSON names using spaCy Transformer model."""
    if not isinstance(text, str) or not text.strip():
        return ""
    doc = nlp(text)
    names, seen = [], set()
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            cleaned = clean_person(ent.text)
            key = cleaned.lower()
            if cleaned and key not in seen:
                seen.add(key)
                names.append(cleaned)
    return ", ".join(names)

def insert_after(df: pd.DataFrame, after_col: str, new_col: str, values: pd.Series) -> None:
    """Insert new_col immediately after after_col (drop existing if present)."""
    if new_col in df.columns:
        df.drop(columns=[new_col], inplace=True)
    idx = df.columns.get_loc(after_col) + 1
    df.insert(idx, new_col, values)

def dataframe_to_compact_event(df: pd.DataFrame) -> str:
    """Compact JSON payload for a grouped event (keeps unique values per column)."""
    def uniq(col):
        return sorted([v for v in df[col].dropna().unique().tolist()]) if col in df else []
    payload = {
        "applied_to": uniq(TARGET_COL_1),
        "by_user": uniq("By User"),
        "in_transaction": uniq("In Transaction"),
        "entered_on": uniq(ENTERED_COL),
        "dates_mmdd": uniq(ENTERED_MMDD_COL),
        "instances": uniq(SOURCE_COL_1),
        "added": uniq(SOURCE_COL_2),
        "row_count": int(len(df)),
    }
    return json.dumps(payload, ensure_ascii=False, indent=2)
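# The payload handed to the LLM has this shape (values here are hypothetical):
# {
#   "applied_to": ["Jane Doe"],
#   "by_user": ["John Smith (12345)"],
#   "in_transaction": ["Assign Roles: Jane Doe"],
#   "entered_on": ["2024-03-14 09:12:00"],
#   "dates_mmdd": ["03/14"],
#   "instances": ["Jane Doe - HR Partner"],
#   "added": ["HR Partner"],
#   "row_count": 2
# }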

# =========================
# Main flow
# =========================

# If processed CSV already exists, skip straight to summarization
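# (the extraction/filtering branch below runs only when OUTPUT_CSV is missing;
# the summarization section at the bottom runs in either case)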
if os.path.exists(OUTPUT_CSV):
    print(f"⚡ Skipping CSV processing — {OUTPUT_CSV} already exists.")
    df = pd.read_csv(OUTPUT_CSV)
    # Ensure MM/DD exists (for old CSVs)
    if ENTERED_MMDD_COL not in df.columns and ENTERED_COL in df.columns:
        ts = pd.to_datetime(df[ENTERED_COL], errors="coerce")
        df[ENTERED_MMDD_COL] = ts.dt.strftime("%m/%d").fillna("")
else:
    print("⚙️ Processing CSV to extract names and generate output...")

    # Load CSV
    df = pd.read_csv(INPUT_CSV)

    # Derive Entered On (MM/DD)
    if ENTERED_COL in df.columns:
        try:
            ts = pd.to_datetime(df[ENTERED_COL], format="mixed", errors="coerce")
        except TypeError:
            ts = pd.to_datetime(df[ENTERED_COL], errors="coerce")
        df[ENTERED_MMDD_COL] = ts.dt.strftime("%m/%d").fillna("")
    else:
        df[ENTERED_MMDD_COL] = ""

    # Live progress counters for names across both columns
    name_counter = Counter()

    def _process_series_with_progress(series: pd.Series, desc: str) -> pd.Series:
        """Iterate with progress, update name_counter, and return extracted names Series."""
        values = series.fillna("").astype(str).tolist()
        out = []
        total = len(values)
        if total == 0:
            return pd.Series([], dtype=object)
        step = max(10, total // 20)  # refresh ~every 5% of rows, but no more often than every 10 rows
        pbar = tqdm(values, desc=f"NER: {desc}", leave=True)
        for i, text in enumerate(pbar, start=1):
            names = extract_names(text)
            # Update running totals (ignore "Automatic Complete")
            for n in [x.strip() for x in names.split(",") if x.strip()]:
                if n.lower() not in AUTO_STRINGS:
                    name_counter[n] += 1
            out.append(names)
            # Periodic status refresh
            if i % step == 0 or i == total:
                top = ", ".join(f"{n}:{c}" for n, c in name_counter.most_common(3))
                pbar.set_postfix_str(f"unique={len(name_counter)} top=[{top}]")
        return pd.Series(out, index=series.index, dtype=object)
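    # While the NER loop runs, the tqdm postfix reads something like (hypothetical values):
    #   unique=42 top=[Jane Doe:7, John Smith:5, Alex Lee:4]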

    # 1) Extract from "Instance that Changed" -> "Applied to"
    if SOURCE_COL_1 in df.columns:
        applied_series = _process_series_with_progress(df[SOURCE_COL_1], SOURCE_COL_1)
        insert_after(df, SOURCE_COL_1, TARGET_COL_1, applied_series)
    else:
        df[TARGET_COL_1] = ""

    # 1a) Simplified quick-fill:
    # If "Applied to" has a value and "Added Applied to" is still empty, copy it over
    if SOURCE_COL_2 in df.columns:
        if TARGET_COL_2 not in df.columns:
            df[TARGET_COL_2] = ""
        for i, row in df.iterrows():
            name = str(row.get(TARGET_COL_1, "")).strip()
            aat = str(row.get(TARGET_COL_2, "")).strip()
            if name and not aat:
                df.at[i, TARGET_COL_2] = name
    else:
        df[TARGET_COL_2] = ""

    # 2) Extract from "Added" -> "Added Applied to" (skip rows with value already set OR empty Added)
    if SOURCE_COL_2 in df.columns:
        mask_need = (df[TARGET_COL_2].fillna("").str.strip() == "") & (df[SOURCE_COL_2].fillna("").str.strip() != "")
        idxs = df.index[mask_need].tolist()
        if idxs:
            values = df.loc[idxs, SOURCE_COL_2]
            pbar = tqdm(values.tolist(), desc=f"NER: {SOURCE_COL_2} (remaining)", leave=True)
            extracted = []
            for text in pbar:
                names = extract_names(text)
                # update counter (ignore "Automatic Complete")
                for n in [x.strip() for x in names.split(",") if x.strip()]:
                    if n.lower() not in AUTO_STRINGS:
                        name_counter[n] += 1
                extracted.append(names)
            df.loc[idxs, TARGET_COL_2] = extracted

    # --- Remove any rows that are purely "Automatic Complete" in key fields ---
    for col in [SOURCE_COL_1, SOURCE_COL_2, "In Transaction"]:
        if col in df.columns:
            df = df[~df[col].apply(is_auto)]

    # --- Keep only selected columns (incl. MM/DD) ---
    keep_cols = [
        SOURCE_COL_1,
        TARGET_COL_1,
        "In Transaction",
        SOURCE_COL_2,
        TARGET_COL_2,
        "By User",
        ENTERED_COL,
        ENTERED_MMDD_COL,
    ]
    df = df[[c for c in keep_cols if c in df.columns]]

    # --- Filter rows: keep where Applied to == Added Applied to (case-insensitive) ---
    if TARGET_COL_1 in df.columns and TARGET_COL_2 in df.columns:
        df = df[
            df[TARGET_COL_1].fillna("").str.strip().str.lower()
            == df[TARGET_COL_2].fillna("").str.strip().str.lower()
        ]

    # --- Drop duplicates & save overall result ---
    df = df.drop_duplicates().reset_index(drop=True)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Saved {len(df)} unique matching rows to {OUTPUT_CSV}")

# =========================
# LM Studio event summary generation (group by By User, then date asc)
# =========================
if not df.empty:
    grouped = df.groupby([TARGET_COL_1, "By User", ENTERED_COL], dropna=False)
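    # One sentence is produced per (Applied to, By User, Entered On) combination;
    # the output file is then regrouped by user and sorted by date.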
    summaries = []  # list of tuples (by_user, mmdd, sentence)

    for keys, gdf in grouped:
        applied_to, by_user, entered_on = keys
        if not applied_to or str(applied_to).strip() == "":
            continue

        mmdd_vals = gdf[ENTERED_MMDD_COL].dropna().astype(str)
        mmdd = next((v for v in mmdd_vals if v.strip()), "")

        payload = dataframe_to_compact_event(gdf)

        prompt = (
            "You are a compliance and information security analyst. "
            "Given the following grouped audit data, produce ONE clear and concise sentence summarizing the event. "
            "Include: (1) who performed the action (By User, include name and ID if available), "
            "(2) who the change applied to (Applied to), "
            "(3) the full list of role names that were assigned or added (from 'Instance that Changed' and 'Added'), "
            "and (4) the date of the event. "
            "Always mention the specific role titles exactly as shown in the data. "
            "If multiple roles were assigned, list them all in a natural phrase like "
            "'assigned the A, B, and C roles'. "
            "Do not include raw JSON, extra commentary, or line breaks. Return only one sentence.\n\n"
            f"Audit Data (JSON):\n{payload}"
        )

        try:
            resp = client.chat.completions.create(
                model=LLM_MODEL,
                messages=[
                    {"role": "system", "content": "You write terse, clear compliance summaries."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.2,
            )
            one_liner = (resp.choices[0].message.content or "").strip()
        except Exception as e:
            one_liner = f"[LLM ERROR] {e}"

        summaries.append((by_user or "Unknown User", mmdd, one_liner))

    # Group by By User, sort each user's entries by mm/dd asc, write file (OVERWRITE)
    grouped_summaries: dict[str, list[tuple[str, str]]] = defaultdict(list)
    for by_user, mmdd, line in summaries:
        grouped_summaries[by_user].append((mmdd, line))

    for user in grouped_summaries:
        grouped_summaries[user].sort(key=lambda x: x[0] or "")
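
    # The resulting EVENT_LOG looks like (hypothetical content):
    #   === John Smith ===
    #   03/14 - John Smith assigned the HR Partner role to Jane Doe on 03/14.
    #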
    with open(EVENT_LOG, "w", encoding="utf-8") as f:
        for user in sorted(grouped_summaries.keys()):
            f.write(f"=== {user} ===\n")
            for mmdd, line in grouped_summaries[user]:
                prefix = f"{mmdd} - " if mmdd else ""
                f.write(f"{prefix}{line}\n")
            f.write("\n")

    total_events = sum(len(v) for v in grouped_summaries.values())
    print(f"📝 Overwrote {EVENT_LOG} with {total_events} grouped event summaries")
else:
    print("ℹ️ No matching rows found; nothing to summarize.")