CSE 30124 - Introduction to Artificial Intelligence: Lab 01 (5 pts.)¶

  • NETID:

This assignment covers the following topics:

  • Data cleaning and splitting messy pandas datasets
  • Creating "features" from raw data (feature engineering)
  • Expert Systems / Rule-based Classification
  • An introduction to sklearn via kNNs

It will consist of 5 tasks:

Task ID Description Points
00 Load and Split Dataset 1
    00-1     - Load Dataset
    00-2     - Explore the Data
    00-3     - Split into Training and Testing DataFrames
01 Creating Features 2
    01-1     - Create Weapon Keyword Feature
    01-2     - Create Average Lying Probability Feature
    01-3     - Prepare Feature Data
02 Expert System / Rule-Based Classification 0
    02-1     - Test Expert System
03 Creating and Using kNN 1
    03-1     - Create kNN
    03-2     - Use kNN
04 Retrieve Suspect Statements 1
05 Generate Police Report 0

Please complete all sections. Some questions will require written answers, while others will involve coding. Be sure to run your code cells to verify your solutions.

Story Progression¶

As you finish your drink, you feel a tap on your shoulder. You respond with:

"You've got the wrong guy pal"

but as you look back, you realize it's a police officer. His name badge says Officer Gaff. He tells you to come with him, so you hop in his car and he drives you back to the precinct, where you're introduced to Director Bryant. It seems your job isn't over quite yet.

Director Bryant tells you that they've collected a list of 20 suspects and they've had them take a polygraph test to see if they can narrow it down at all.

Evidence 1: One of your TAs, a prime suspect, taking a polygraph test.

For each of the 20 suspects, the police asked them 10 questions and recorded the polygraph results for each question, resulting in the following data:

  • name, statement_1 through statement_10, lying_prob_1 through lying_prob_10

In addition to the 20 suspects they've had take a polygraph test, they have 60 labeled examples of suspects from prior cases, but unfortunately the data for these examples is in an entirely different format:

  • name, weapon_mentions, total_lying_prob, good_suspect

Director Bryant tells you that the intern they hired to do the data processing just offloaded it to Gemini, and all of the data got mixed together into one evidence file called suspect_data.csv, which has been giving them fits. He heard about your success with the travel itinerary last week and is hoping you can help!

Task 00: Load and Split Dataset (1 pt.)¶

Task 00-1: Load Dataset (0 pts.)¶

Loading the Dataset¶

Run the cell below to download the evidence, load suspect_data.csv into a dataframe and print out the first five rows.

Task 00-1: Code (0 pts.)¶

In [ ]:
import os
import pandas as pd

try:
    import google.colab
    REPO_URL = "https://github.com/wtheisen/nd-cse-30124-homeworks.git"

    REPO_PATH = "/content/nd-cse-30124-homeworks"
    L_PATH = "nd-cse-30124-homeworks/evidence/lab01"

    %cd /content/
    !rm -rf {REPO_PATH}

    # Clone repo
    if not os.path.exists(REPO_PATH):
        !git clone {REPO_URL}

        # cd into the data folder
        %cd {L_PATH}
        !pwd

except ImportError:
    print("Unable to download repo, either:")
    print("\tA.) You're not on colab")
    print("\tB.) It has already been cloned")

# TODO: Load the dataset via pandas

# TODO: Print the first 5 rows of the dataframe

Task 00-1: Expected Output (0 pts.)¶

suspect_name	weapon_mentions	total_lying_prob	good_suspect	statement_1	statement_2	statement_3	statement_4	statement_5	statement_6	...	lying_prob_1	lying_prob_2	lying_prob_3	lying_prob_4	lying_prob_5	lying_prob_6	lying_prob_7	lying_prob_8	lying_prob_9	lying_prob_10
0	Olivia Zino	NaN	NaN	NaN	I arrived late to the party	Someone was carrying a bag upstairs	I heard arguing in the study	The buffet in the kitchen looked incomplete	I stayed near the entrance	Someone mentioned a knife was missing	...	0.50	0.52	0.49	0.51	0.50	0.52	0.49	0.51	0.50	0.51
1	Claudia Huck	3.0	0.77	1.0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	Everton Albuquerque de Oliveira	4.0	0.85	1.0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	Cameron Rohlfsen	NaN	NaN	NaN	I arrived alone	I mingled with guests	The host mentioned a knife collection	I stayed in common areas	I left at midnight	Nothing caught my eye	...	0.54	0.53	0.55	0.54	0.53	0.55	0.54	0.53	0.54	0.55
4	Samuel Gisiner	3.0	0.86	1.0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

Note: Your editor will hopefully format the dataframe rows nicely for you

Task 00-2: Description (0 pts.)¶

Splitting the Dataset¶

Using pandas, separate the single dataframe into two new dataframes: one containing the 60 suspect examples from prior cases (those where good_suspect is set to either 0 or 1), and one containing the interview results for the 20 suspects in the current case (those where good_suspect is NaN). Below are some useful pandas functions that may help you!

Checking for missing values:

# Count NaN values in each column
df.isna().sum()

# Check if a specific column has NaN values
df['column_name'].isna()  # Returns True/False for each row

# Filter to rows where a column is NOT NaN
df[df['column_name'].notna()]

Understanding the data types:

df.info()  # Shows data type and non-null count for each column
df.dtypes  # Just shows data types

Filtering Dataframes on Conditionals

# Basic filtering
adults = df[df['age'] >= 18]           # Rows where age is 18+
students = df[df['role'] == 'student'] # Rows where role is 'student'

# Filtering by NaN values
has_email = df[df['email'].notna()]    # Rows where email is NOT NaN
no_email = df[df['email'].isna()]      # Rows where email IS NaN

# Multiple conditions (use & for AND, | for OR)
young_students = df[(df['age'] < 25) & (df['role'] == 'student')]

Important: When using multiple conditions, wrap each condition in parentheses!

Task 00-2: Code (0 pts.)¶

In [ ]:
# TODO: Split the data based on whether good_suspect is filled in

print(f"Training samples: {len(training_df)}")
print(f"Testing samples: {len(testing_df)}")
print(f"Total: {len(training_df) + len(testing_df)}")

Task 00-2: Expected Output¶

Training samples: 60
Testing samples: 20
Total: 80

Story Progression¶

You can't believe the Police had the training and testing data mixed together in the same file; Professor Theisen would have killed you. I guess that's what you get when some intern blindly uses AI. Fortunately, you figured out how to separate them and now you can see that:

  • Training data (60 records): Has weapon_mentions, total_lying_prob, and good_suspect (the label)
  • Testing data (20 records): Has raw statement_1 through statement_10 and lying_prob_1 through lying_prob_10

Unfortunately, the testing data format doesn't match the training data format. You'll need to do some feature engineering to transform the raw testing data into the same format as the training data!

Feature engineering is the process of transforming raw data into meaningful numerical values ("features") that machine learning algorithms can use. It's often said that feature engineering is the most important part of machine learning - good features can make a simple algorithm work great, while bad features will make even sophisticated algorithms fail.

We'll need to create two features from the raw polygraph data in order to match our examples:

  1. weapon_mentions (count feature): How many times weapon-related keywords appear in the suspect's statements
  2. total_lying_prob (aggregate feature): The average lying probability across all statements
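As a toy illustration of both features (using made-up statements and probabilities, not the real lab data), here is how a count feature and an aggregate feature can be computed from raw lists:

```python
import numpy as np

# Hypothetical raw polygraph data for one suspect (NOT from suspect_data.csv)
statements = ["I saw a knife on the table", "I left early", "Someone had a bag"]
lying_probs = [0.62, 0.55, 0.71]

weapons = ['gas', 'knife', 'poison', 'rope', 'bag', 'gun']

# Count feature: weapon keyword occurrences across all statements
combined = ' '.join(statements).lower()
weapon_mentions = sum(combined.count(w) for w in weapons)

# Aggregate feature: average lying probability across all answers
total_lying_prob = np.mean(lying_probs)

print(weapon_mentions)             # 2 ("knife" and "bag")
print(round(total_lying_prob, 2))  # 0.63
```

The same two steps, applied row by row, are exactly what Tasks 01-1 and 01-2 ask you to do on the real testing dataframe.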

Task 01: Create Features from Testing Data to Match Training Data (2 pts.)¶

Task 01-1: Description (0 pts.)¶

Creating the Weapon Mention Feature¶

Our first feature counts how many times weapon-related keywords appear in a suspect's answers to the polygraph. The weapons we're looking for are

weapons = ['gas', 'knife', 'poison', 'rope', 'bag', 'gun']

which were the weapons identified by the Police at the scene of the crime in homework01.

Evidence 2: Mapping polygraph answers to weapon mentions.

Useful String Operations in Pandas¶

# Convert to lowercase (important for matching!)
text = "I saw a KNIFE"
text.lower()  # Returns: "i saw a knife"

# Count occurrences of a substring
text = "the knife was a sharp knife"
text.count('knife')  # Returns: 2

# Join multiple strings together
statements = ["I was home", "I saw nothing", "I left early"]
combined = ' '.join(statements)  # "I was home I saw nothing I left early"

Using df.apply() to Process Each Row¶

When you need to apply a custom function to each row of a DataFrame, use df.apply() with axis=1:

def my_function(row):
    # row is a pandas Series representing one row
    # You can access columns like: row['column_name']
    return some_value

# Apply to each row and create a new column
df['new_column'] = df.apply(my_function, axis=1)

# Or use a lambda (anonymous function) for simple operations
df['new_column'] = df.apply(lambda row: row['a'] + row['b'], axis=1)

In the cell below, finish the count_weapon_mentions function to get the weapon mention counts from a suspect's polygraph answers.

Task 01-1: Code (1 pt.)¶

In [ ]:
weapons = ['gas', 'knife', 'poison', 'rope', 'bag', 'gun']

def count_weapon_mentions(row, weapons):
    """
    Count how many weapon keywords appear across all statements for a suspect.

    Args:
        row (pandas.Series): A pandas Series representing one suspect's data
        weapons (list): List of weapon keywords to search for

    Returns:
        weapon_count (int): Total count of weapon mentions
    """
    count = 0

    # TODO: Get all statement columns (statement_1, statement_2, ..., statement_10)

    # TODO: Combine all statements into one text block
    
    # TODO: Count each weapon keyword
    
    return count

# TODO: Apply to testing data using df.apply() with axis=1 (apply to each row)

print("Weapon mentions per suspect:")
print(testing_df[['suspect_name', 'weapon_mentions']].head(10))

Task 01-1: Expected Output (1 pt.)¶

Weapon mentions per suspect:
        suspect_name  weapon_mentions
0        Olivia Zino                2
3   Cameron Rohlfsen                1
9       Liam Sagucio                1
14    Olivia Pierret                0
16        Ryan Putka                0
18   Logan St Pierre                0
22     Jack Mangione                6
26        Tom Lohman                6
32        Conor Zech                0
37     Madelyn Perez                1

Task 01-2: Description (0 pts.)¶

Creating the Average Lying Probability Feature¶

The other feature we were missing is the total_lying_prob. It seems reasonable that if someone has a high lying probability on multiple answers during the polygraph, they're more suspicious overall. We want a single number that captures their overall "suspiciousness."


Aggregation Methods¶

There are several ways to combine multiple values into one:

| Method | Function | When to use |
| --- | --- | --- |
| Mean (average) | `np.mean()` | When all values contribute equally |
| Sum | `np.sum()` | When you want the total amount |
| Max | `np.max()` | When the highest value matters most |
| Min | `np.min()` | When the lowest value matters most |
| Median | `np.median()` | When you want to ignore outliers |

For our case, the mean fits best: every question should contribute equally to a suspect's overall suspiciousness.


Example Calculation¶

import numpy as np

# A suspect's lying probabilities for 10 statements
probs = [0.75, 0.82, 0.68, 0.79, 0.85, 0.71, 0.88, 0.73, 0.81, 0.77]

# Calculate average
average = np.mean(probs)  # Returns: 0.779

We'll use the mean (average) of all lying probabilities. This gives us a single value between 0 and 1 that represents how suspicious the suspect's polygraph answers were on average. In the cell below, finish the avg_lying_probs function to compute the average lying probability for a suspect's polygraph answers.

Evidence 3: Averaging polygraph lying probabilities.

Task 01-2: Code (1 pt.)¶

In [ ]:
import numpy as np

def avg_lying_probs(row):
    """
    Extract all lying probability values from a row and then calculate the average.

    Args:
        row (pandas.Series): A pandas Series representing one suspect's data

    Returns:
        avg (float): Average lying probability (between 0 and 1)
    """

    # TODO: Get all lying probability columns, we can use col.startswith to get them

    # TODO: Calculate the average of all lying probabilities

# TODO: Apply to testing data

print("Total lying probability (average) per suspect:")
print(testing_df[['suspect_name', 'total_lying_prob']].head(10))

Task 01-2: Expected Output (1 pt.)¶

Total lying probability (average) per suspect:
        suspect_name  total_lying_prob
0        Olivia Zino             0.505
3   Cameron Rohlfsen             0.540
9       Liam Sagucio             0.540
14    Olivia Pierret             0.175
16        Ryan Putka             0.175
18   Logan St Pierre             0.175
22     Jack Mangione             0.784
26        Tom Lohman             0.783
32        Conor Zech             0.177
37     Madelyn Perez             0.537

Task 01-3: Description (0 pts.)¶

Prepare Feature Matrix for Machine Learning¶

Now that we've created our features, we need to convert the data into the format that our classification models can use.


sklearn's Data Format¶

All sklearn algorithms expect data in a specific format:

| Variable | Shape | Description | Example |
| --- | --- | --- | --- |
| X | (n_samples, n_features) | Feature matrix | [[4, 0.85], [0, 0.12], [3, 0.78]] |
| y | (n_samples,) | Target labels | [1, 0, 1] |

  • X is a 2D array where each row is one sample (suspect) and each column is one feature
  • y is a 1D array of labels (what we're trying to predict)
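Using the example values from the table above, you can check these shapes directly:

```python
import numpy as np

# The example feature matrix and labels from the table above
X = np.array([[4, 0.85], [0, 0.12], [3, 0.78]])
y = np.array([1, 0, 1])

print(X.shape)  # (3, 2) -- 3 samples, 2 features
print(y.shape)  # (3,)   -- one label per sample
```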

Converting DataFrame to NumPy Array¶

# Select specific columns and convert to numpy array
X = df[['feature1', 'feature2']].values  # .values converts to numpy array

# Get labels
y = df['label_column'].values

Task 01-3: Code (0 pts.)¶

In [ ]:
# TODO: Prepare testing features to match training format

# TODO: Training data already has features and labels

Story Progression¶

Great, now we have features that match! The testing data now has the same format as the training data:

  • weapon_mentions: Count of weapon keywords in statements
  • total_lying_prob: Average lying probability across all statements

Task 02: Expert System / Rule-Based Classification (0 pts.)¶

Task 02-1: Description (0 pts.)¶

What is an Expert System?¶

An expert system is a computer program that encodes human expert knowledge as explicit rules. It's one of the oldest approaches to AI, dating back to the 1970s.


How Expert Systems Work¶

Expert systems use if-then rules based on domain knowledge:

# Medical diagnosis expert system (simplified)
if fever > 101 and cough and fatigue:
    diagnosis = "flu"
elif fever > 101 and rash:
    diagnosis = "measles"
elif headache and stiff_neck and fever:
    diagnosis = "meningitis"
else:
    diagnosis = "unknown"

Expert Systems vs Machine Learning¶

| Aspect | Expert Systems | Machine Learning |
| --- | --- | --- |
| Knowledge source | Human experts write rules | Algorithm learns from data |
| Interpretability | High - you can trace exactly why | Often low - "black box" |
| Training data needed | None | Usually lots |
| Handles edge cases | Poorly - only handles coded rules | Better - generalizes patterns |
| Maintenance | Hard - rules become complex | Easy - just retrain with new data |

Expert Police Suspect System Rules¶

  1. High lying probability (> 60%) → Suspicious
  2. Multiple weapons (≥ 3) AND moderate lying (> 50%) → Suspicious
  3. Many weapons (≥ 5) → Suspicious (regardless of lying probability)
  4. Very low lying probability (< 25%) → Probably innocent
Evidence 4: The old police suspect flowchart; crazy that this is what they used to do.

Task 02-1: Code (0 pts.)¶

In [ ]:
def expert_classify_suspect(weapon_mentions, lying_prob):
    """
    Rule-based expert system for classifying suspects.
    Rules derived from "domain expert" knowledge about polygraph analysis.

    Args:
        weapon_mentions: Number of weapon keywords in statements
        lying_prob: Average lying probability across all statements (0-1)

    Returns:
        1 if suspect is worth investigating ("good suspect")
        0 otherwise
    """
    # Rule 1: High average lying probability is very suspicious
    if lying_prob > 0.6:
        return 1  # Good suspect

    # Rule 2: Multiple weapon mentions combined with moderate lying
    if weapon_mentions >= 3 and lying_prob > 0.5:
        return 1  # Good suspect

    # Rule 3: Many weapon mentions is suspicious regardless
    if weapon_mentions >= 5:
        return 1  # Good suspect

    # Rule 4: Very low lying probability - probably innocent
    if lying_prob < 0.25:
        return 0  # Not a good suspect

    # Default: not enough evidence
    return 0

# TODO: Apply expert system to testing data

# See who the expert system flags
expert_suspects = testing_df[testing_df['expert_prediction'] == 1]
print(f"Expert system identified {len(expert_suspects)} suspects:")
print(expert_suspects[['suspect_name', 'weapon_mentions', 'total_lying_prob']].to_string())

Task 02-1: Expected Output (0 pts.)¶

Expert system identified 4 suspects:
     suspect_name  weapon_mentions  total_lying_prob
21  Jack Mangione                6             0.784
25     Tom Lohman                6             0.783
65  Sophia Noonan                4             0.575
76   Claire Green                4             0.779

Story Progression¶

Well that doesn't seem right, the Police had pretty explicitly identified 6 suspects at the scene of the crime: Mr. Green, Professor Plum, Colonel Mustard, Mrs. White, Mrs. Peacock, and Miss Scarlet.

This boomer, out-of-date expert system only found four! It may be time to try something on your own. As you're sneaking a quaff from your flask, it hits you: you could try using a k-Nearest Neighbors model to see if that gives you anything better than this dumb flowchart system. Luckily for you, most common ML algorithms can be accessed through a library called sklearn.


What is sklearn?¶

Scikit-learn (sklearn) is Python's most popular machine learning library. It provides:

  • Dozens of ML algorithms (kNN, Decision Trees, SVM, etc.)
  • Tools for preprocessing, evaluation, and model selection
  • A consistent API across all algorithms

sklearn's Consistent API¶

One of sklearn's best features is its consistent API. All classifiers follow the same pattern:

from sklearn.neighbors import KNeighborsClassifier

# Step 1: Create the model
model = KNeighborsClassifier(n_neighbors=N)

# Step 2: Train on data
model.fit(X_train, y_train)

# Step 3: Make predictions
predictions = model.predict(X_test)

Note: If one were to be asked to use a Decision Tree at some point (cough, cough), it's possible that it would look very similar.

Task 03: Use kNN to Find the Suspects (1 pt.)¶

Task 03-1: Description (0 pts.)¶

What is k-Nearest Neighbors (kNN)?¶

k-Nearest Neighbors is one of the simplest classification algorithms: to classify a new sample, it finds the k training samples closest to it in feature space (typically by Euclidean distance) and predicts whichever class holds the majority among those neighbors. There's no real "training" step; the model simply memorizes the training data and does all of its work at prediction time.

In the cell below, create and fit a kNN classifier from sklearn.
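To make the idea concrete, here is a from-scratch sketch of a single kNN prediction on tiny made-up data (the lab itself uses sklearn; this only illustrates the mechanics):

```python
import numpy as np
from collections import Counter

# Tiny made-up training set: [weapon_mentions, total_lying_prob] -> good_suspect
X_train = np.array([[4, 0.85], [0, 0.12], [3, 0.78], [1, 0.20]])
y_train = np.array([1, 0, 1, 0])

query = np.array([3, 0.80])  # new suspect to classify
k = 3

# Euclidean distance from the query to every training sample
dists = np.linalg.norm(X_train - query, axis=1)

# Indices of the k closest training samples
nearest = np.argsort(dists)[:k]

# Majority vote among the neighbors' labels
prediction = Counter(y_train[nearest]).most_common(1)[0][0]
print(prediction)  # 1
```

sklearn's `KNeighborsClassifier` does exactly this (plus efficient neighbor search) behind its `fit`/`predict` API.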

Task 03-1: Code (0 pts.)¶

In [ ]:
from sklearn.neighbors import KNeighborsClassifier

# TODO: Create kNN classifier for k = 3

# TODO: Fit on training data

print(f"Number of training samples: {len(X_train)}")
print(f"Number of neighbors (k): {knn.n_neighbors}")

Task 03-1: Expected Output (0 pts.)¶

Number of training samples: 60
Number of neighbors (k): 3

Task 03-2: Description (0 pts.)¶

Using kNN to Make Predictions¶

Now let's use our trained kNN to classify our suspects.

Key methods:

  • predict(X): Returns the predicted class (0 or 1) for each sample

Task 03-2: Code (0 pts.)¶

In [ ]:
# TODO: Predict which suspects are "good suspects"

# TODO: Add predictions back to dataframe

# TODO: Filter to good suspects, those with knn_prediction == 1

print(f"kNN (k=3) identified {len(knn_suspects)} suspects to investigate further:")
print(knn_suspects[['suspect_name', 'weapon_mentions', 'total_lying_prob', 'knn_prediction']].to_string())

Task 03-2: Expected Output (0 pts.)¶

kNN (k=3) identified 6 suspects to investigate further:
        suspect_name  weapon_mentions  total_lying_prob  knn_prediction
3   Cameron Rohlfsen                1             0.540             1.0
9       Liam Sagucio                1             0.540             1.0
22     Jack Mangione                6             0.784             1.0
26        Tom Lohman                6             0.783             1.0
37     Madelyn Perez                1             0.537             1.0
76      Claire Green                4             0.779             1.0

Story Progression¶

Hmmmmmmm, that's odd: you could have sworn that several of these suspects had alibis for the night of the murder. You know for sure at least that Cameron was rocking the cheer squad at a basketball game that very night. Something must have gone wrong. You wonder if a different value for k might affect the results. It may be worth trying some different hyperparameters. A hyperparameter is a value you choose before training that affects how the model behaves. Unlike regular parameters (which are learned from data), hyperparameters are set by you.


Examples of Hyperparameters¶

| Algorithm | Hyperparameter | What it controls |
| --- | --- | --- |
| kNN | `n_neighbors` (k) | How many neighbors to consider |
| Decision Tree | `max_depth` | How deep the tree can grow |
| Neural Network | `learning_rate` | How fast to update weights |

The Effect of k in kNN¶

  • Small k (e.g., 1 or 3):

    • More sensitive to local patterns
    • Can be influenced by noise/outliers
    • May overfit
  • Large k (e.g., 10 or 20):

    • More robust to noise
    • Smoother decision boundaries
    • May underfit (miss local patterns)

Task 03-3: Description (0 pts.)¶

Hyperparameter Tuning¶

In the cell below try the following values of k:

[1, 3, 5, 7, 9]

to see if you can match the expected output of:

6 suspects - ['Olivia Zino', 'Jack Mangione', 'Tom Lohman', 'Cesar Cervera', 'Sophia Noonan', 'Claire Green']

Remember that we typically use odd numbers for k so we don't end up with a tie during the vote!
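A quick illustration of why an even k is risky, using hypothetical neighbor labels (not the lab data):

```python
from collections import Counter

# Hypothetical labels of the nearest neighbors for one suspect
even_votes = Counter([1, 0, 1, 0])  # k=4: two votes each -- a tie
odd_votes = Counter([1, 0, 1])      # k=3: clear majority for class 1

print(even_votes[0] == even_votes[1])  # True -- the vote is tied
print(odd_votes.most_common(1)[0][0])  # 1
```

With an even k, the classifier has to fall back on an arbitrary tie-breaking rule; an odd k (for binary classification) guarantees a majority.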

Evidence 5: Using kNN to find better suspects.

Task 03-3: Code (0 pts.)¶

In [ ]:
for k in [1, 3, 5, 7, 9]:
    # TODO: Try the different values of k listed above

    print(f"{len(knn_suspects)} suspects - {knn_suspects['suspect_name'].tolist()}")

Task 03-3: Expected Output (1 pt.)¶

6 suspects - ['Cameron Rohlfsen', 'Liam Sagucio', 'Jack Mangione', 'Tom Lohman', 'Madelyn Perez', 'Claire Green']
6 suspects - ['Cameron Rohlfsen', 'Liam Sagucio', 'Jack Mangione', 'Tom Lohman', 'Madelyn Perez', 'Claire Green']
6 suspects - ['Olivia Zino', 'Jack Mangione', 'Tom Lohman', 'Cesar Cervera', 'Sophia Noonan', 'Claire Green']
6 suspects - ['Olivia Zino', 'Jack Mangione', 'Tom Lohman', 'Cesar Cervera', 'Sophia Noonan', 'Claire Green']
6 suspects - ['Olivia Zino', 'Jack Mangione', 'Tom Lohman', 'Cesar Cervera', 'Sophia Noonan', 'Claire Green']

Story Progression¶

Wow, depending on what you choose for k you get different suspects! You hope the police don't bring in anyone innocent. Just to be sure, you decide to leave the names that changed for the police to sort out, but you also notice that a few names were there the entire time:

  • Claire Green
  • Tom Lohman
  • Jack Mangione

These three must be really suspicious. It may be worth paying a couple of them a visit on your own time...

Task 04: Collecting Suspect Statements (1 pt.)¶

Task 04: Description (0 pts.)¶

Visit the following two suspects to collect their statements about the night of the murder. You'll need this data to complete homework02. Only one team member needs to visit each suspect, so feel free to divide up the work. The available times to pick up the suspects' (TAs') statements are listed below:

  • Jack Mangione: 5:00 - 7:00 on Monday and Wednesday in the CSE Commons
  • Claire Green: 6:00 - 7:30 Tuesday and 6:00 - 8:00 Thursday at Hagerty Cafe

Note: You'll need this data to complete homework02, though most of you won't actually read this part, will leave it until an hour before homework02 is due, and will be mad at me. (The TAs are super cool and flexible, so if you genuinely cannot find time in the next month to make it to one of their OH times, please just reach out to them!)

Task 05: Generate Police Report¶

Task 05: Description (0 pts.)¶

Run the code cell below to generate a report for the Police and submit it on Canvas!

Task 05: Code (0 pts.)¶

In [ ]:
import os, json

ASS_PATH = "nd-cse-30124-homeworks/labs"
ASS = "lab01"

try:
    from google.colab import _message, files

    # where you WANT it to live (repo folder)
    repo_ipynb_path = f"/content/{ASS_PATH}/{ASS}/{ASS}.ipynb"

    # grab current notebook contents from the UI
    nb = _message.blocking_request("get_ipynb", timeout_sec=1)["ipynb"]

    # write it into the repo folder as a real file
    os.makedirs(os.path.dirname(repo_ipynb_path), exist_ok=True)
    with open(repo_ipynb_path, "w", encoding="utf-8") as f:
        json.dump(nb, f)

    # convert + download html
    !jupyter nbconvert --to html "{repo_ipynb_path}"
    files.download(repo_ipynb_path.replace(".ipynb", ".html"))
except Exception:
    import subprocess

    nb_fp = os.getcwd() + f'/{ASS}.ipynb'
    print(os.getcwd())

    try:
        subprocess.run(["jupyter", "nbconvert", "--to", "html", nb_fp], check=True)
    except Exception:
        print('[WARNING]: Unable to export notebook as .html')