import os
import sys
import subprocess
import pickle
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tqdm import tqdm

REQUIRED_PACKAGES = ["numpy", "matplotlib", "tqdm", "scikit-learn"]

def install_packages(packages=None):
    """Install any required packages that cannot be imported.

    Each package is probed by importing it; when the import fails, pip is
    invoked through the current interpreter to install it. Installation is
    best-effort: a failure is reported on stdout and the remaining packages
    are still processed.

    Args:
        packages: Iterable of pip distribution names to check. Defaults to
            the module-level ``REQUIRED_PACKAGES`` list.

    Returns:
        None.
    """
    import importlib

    # pip distribution names whose importable module name differs. Probing
    # "scikit-learn" directly can never succeed (it is not a valid module
    # name), which would trigger a pip install on every run.
    import_names = {"scikit-learn": "sklearn"}

    if packages is None:
        packages = REQUIRED_PACKAGES

    for package in packages:
        module_name = import_names.get(package, package)
        try:
            importlib.import_module(module_name)
        except ImportError:
            print(f"Package '{package}' is missing. Attempting to install...")
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            except (subprocess.CalledProcessError, OSError) as e:
                # Best-effort: report and continue with the next package.
                print(f"Failed to install package '{package}': {e}")
            else:
                # Make the freshly installed package visible to the import
                # system of this already-running interpreter.
                importlib.invalidate_caches()

# Ensure required packages are installed.
# NOTE(review): the third-party imports at the top of this file execute before
# this call, so a missing package raises ImportError there before any
# installation is attempted — consider moving this bootstrap ahead of them.
install_packages()

import os
import pickle
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tqdm import tqdm

def explore_kmeans_clustering(
    matrix_tfidf,
    vectorizer,
    path_dset,
    dset_name,
    timestamp=None,
    min_clusters=2,
    max_clusters=10,
    step=1
):
    """Fit K-means over a range of k values and persist the results.

    For every k in ``range(min_clusters, max_clusters + 1, step)`` a KMeans
    model is fitted on ``matrix_tfidf`` and its inertia and silhouette score
    are recorded. The model with the lowest inertia is kept as the final
    model. The final model, the vectorizer, the cluster assignments and the
    score lists are pickled into ``path_dset``, together with an inertia plot
    and a silhouette plot.

    NOTE(review): K-means inertia typically keeps decreasing as k grows, so
    selecting by minimum inertia will usually pick the largest k tried. The
    silhouette-based k is returned alongside for comparison.

    Args:
        matrix_tfidf: Feature matrix passed to ``KMeans.fit`` and
            ``silhouette_score``.
        vectorizer: Object pickled next to the model; it is not used for any
            computation here.
        path_dset: Output directory; created if it does not exist.
        dset_name: Prefix for every output file name.
        timestamp: String embedded in every output file name. When ``None``
            (the default) the current local time formatted as
            ``%Y%m%dT%H%M`` is used, evaluated at call time.
        min_clusters: Smallest k to try.
        max_clusters: Largest k to try (inclusive).
        step: Increment between successive k values.

    Returns:
        dict with the paths of all written files (``model_path``,
        ``vectorizer_path``, ``cluster_assignments_path``, ``scores_path``,
        ``elbow_plot_path``, ``silhouette_plot_path``), the selected k values
        (``best_k_inertia``, ``best_k_silhouette``, ``final_k``), the score
        lists (``inertia_values``, ``silhouette_values``) and the list of k
        values tried (``cluster_range``).

    Raises:
        ValueError: If the requested range of k values is empty.
    """
    # Validate before touching the filesystem or fitting anything.
    cluster_range = range(min_clusters, max_clusters + 1, step)
    if not cluster_range:
        raise ValueError(
            f"Empty cluster range: min_clusters={min_clusters}, "
            f"max_clusters={max_clusters}, step={step}"
        )

    # Resolve the timestamp per call; a default computed in the signature
    # would be frozen at import time and shared by every call.
    if timestamp is None:
        timestamp = datetime.datetime.now().strftime("%Y%m%dT%H%M")

    output_dir = os.fspath(path_dset)
    os.makedirs(output_dir, exist_ok=True)
    ks = list(cluster_range)

    def _artifact_path(label, ext):
        # Build "<dset_name>_<label>_<timestamp>.<ext>" inside the output dir.
        return os.path.join(output_dir, f"{dset_name}_{label}_{timestamp}.{ext}")

    def _dump(obj, label):
        # Pickle obj to its artifact path and return that path.
        path = _artifact_path(label, "pkl")
        with open(path, "wb") as f:
            pickle.dump(obj, f)
        return path

    def _safe_silhouette(labels):
        # silhouette_score needs 2 <= n_labels <= n_samples - 1; record NaN
        # instead of raising when the fit does not satisfy that.
        n_distinct = np.unique(labels).size
        if 2 <= n_distinct < len(labels):
            return silhouette_score(matrix_tfidf, labels)
        return np.nan

    def _save_line_plot(values, best_k, file_label, ylabel, title, legend_label, **line_kwargs):
        # Plot values against k, mark best_k, save as PNG and return the path.
        path = _artifact_path(file_label, "png")
        fig = plt.figure(figsize=(8, 5))
        try:
            plt.plot(ks, values, marker="o", **line_kwargs)
            plt.xlabel("Number of Clusters (k)")
            plt.ylabel(ylabel)
            plt.title(title)
            plt.axvline(best_k, color="red", linestyle="--", label=legend_label)
            plt.grid()
            plt.legend()
            plt.savefig(path)
        finally:
            # Release the figure even if saving fails.
            plt.close(fig)
        return path

    # Sweep k, remembering the fitted model with the lowest inertia so it
    # does not have to be refitted afterwards (same random_state either way).
    inertia_values, silhouette_values = [], []
    final_kmeans = None
    for k in tqdm(cluster_range, desc="K-means Clustering"):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
        kmeans.fit(matrix_tfidf)
        inertia_values.append(kmeans.inertia_)
        silhouette_values.append(_safe_silhouette(kmeans.labels_))
        # Strict "<" keeps the first minimum, matching np.argmin below.
        if final_kmeans is None or kmeans.inertia_ < final_kmeans.inertia_:
            final_kmeans = kmeans

    # Determine the best k by inertia and by silhouette score.
    best_k_inertia = cluster_range[int(np.argmin(inertia_values))]
    scored = [(k, score) for k, score in zip(ks, silhouette_values) if not np.isnan(score)]
    if scored:
        best_k_silhouette = max(scored, key=lambda pair: pair[1])[0]
    else:
        # No usable silhouette score: fall back to the inertia-based k.
        best_k_silhouette = best_k_inertia

    # The final model is the one selected by inertia.
    final_k = best_k_inertia
    cluster_assignments = final_kmeans.labels_

    # Save the model, vectorizer, cluster assignments, and scores.
    model_path = _dump(final_kmeans, "kmeans_model")
    vectorizer_path = _dump(vectorizer, "tfidf_vectorizer")
    cluster_assignments_path = _dump(cluster_assignments, "cluster_assignments")
    scores_path = _dump(
        {"inertia": inertia_values, "silhouette": silhouette_values},
        "cluster_scores",
    )

    # Plot inertia and silhouette score against k.
    elbow_plot_path = _save_line_plot(
        inertia_values,
        best_k_inertia,
        "elbow_plot",
        "Inertia (Sum of Squares)",
        "Elbow Method: Inertia vs. Number of Clusters",
        f"Best k (Inertia) = {best_k_inertia}",
    )
    silhouette_plot_path = _save_line_plot(
        silhouette_values,
        best_k_silhouette,
        "silhouette_plot",
        "Silhouette Score",
        "Silhouette Score vs. Number of Clusters",
        f"Best k (Silhouette) = {best_k_silhouette}",
        color="green",
    )

    return {
        "model_path": model_path,
        "vectorizer_path": vectorizer_path,
        "cluster_assignments_path": cluster_assignments_path,
        "scores_path": scores_path,
        "elbow_plot_path": elbow_plot_path,
        "silhouette_plot_path": silhouette_plot_path,
        "best_k_inertia": best_k_inertia,
        "best_k_silhouette": best_k_silhouette,
        "final_k": final_k,
        "inertia_values": inertia_values,
        "silhouette_values": silhouette_values,
        "cluster_range": ks,
    }

import os
import pickle
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from tqdm import tqdm

def explore_kmeans_clustering_minibatch(
    matrix_tfidf,
    vectorizer,
    path_dset,
    dset_name,
    timestamp=None,
    min_clusters=2,
    max_clusters=10,
    step=1,
    batch_size=1024
):
    """Fit MiniBatch K-means over a range of k values and persist the results.

    For every k in ``range(min_clusters, max_clusters + 1, step)`` a
    MiniBatchKMeans model is fitted on ``matrix_tfidf`` and its inertia and
    silhouette score are recorded. The model with the lowest inertia is kept
    as the final model. The final model, the vectorizer, the cluster
    assignments and the score lists are pickled into ``path_dset``, together
    with an inertia plot and a silhouette plot.

    NOTE(review): inertia typically keeps decreasing as k grows, so selecting
    by minimum inertia will usually pick the largest k tried. The
    silhouette-based k is returned alongside for comparison.

    Args:
        matrix_tfidf: Feature matrix passed to ``MiniBatchKMeans.fit`` and
            ``silhouette_score``.
        vectorizer: Object pickled next to the model; it is not used for any
            computation here.
        path_dset: Output directory; created if it does not exist.
        dset_name: Prefix for every output file name.
        timestamp: String embedded in every output file name. When ``None``
            (the default) the current local time formatted as
            ``%Y%m%dT%H%M`` is used, evaluated at call time.
        min_clusters: Smallest k to try.
        max_clusters: Largest k to try (inclusive).
        step: Increment between successive k values.
        batch_size: Mini-batch size forwarded to ``MiniBatchKMeans``.

    Returns:
        dict with the paths of all written files (``model_path``,
        ``vectorizer_path``, ``cluster_assignments_path``, ``scores_path``,
        ``elbow_plot_path``, ``silhouette_plot_path``), the selected k values
        (``best_k_inertia``, ``best_k_silhouette``, ``final_k``), the score
        lists (``inertia_values``, ``silhouette_values``) and the list of k
        values tried (``cluster_range``).

    Raises:
        ValueError: If the requested range of k values is empty.
    """
    # Validate before touching the filesystem or fitting anything.
    cluster_range = range(min_clusters, max_clusters + 1, step)
    if not cluster_range:
        raise ValueError(
            f"Empty cluster range: min_clusters={min_clusters}, "
            f"max_clusters={max_clusters}, step={step}"
        )

    # Resolve the timestamp per call; a default computed in the signature
    # would be frozen at import time and shared by every call.
    if timestamp is None:
        timestamp = datetime.datetime.now().strftime("%Y%m%dT%H%M")

    output_dir = os.fspath(path_dset)
    os.makedirs(output_dir, exist_ok=True)
    ks = list(cluster_range)

    def _artifact_path(label, ext):
        # Build "<dset_name>_<label>_<timestamp>.<ext>" inside the output dir.
        return os.path.join(output_dir, f"{dset_name}_{label}_{timestamp}.{ext}")

    def _dump(obj, label):
        # Pickle obj to its artifact path and return that path.
        path = _artifact_path(label, "pkl")
        with open(path, "wb") as f:
            pickle.dump(obj, f)
        return path

    def _safe_silhouette(labels):
        # silhouette_score needs 2 <= n_labels <= n_samples - 1; record NaN
        # instead of raising when the fit does not satisfy that.
        n_distinct = np.unique(labels).size
        if 2 <= n_distinct < len(labels):
            return silhouette_score(matrix_tfidf, labels)
        return np.nan

    def _save_line_plot(values, best_k, file_label, ylabel, title, legend_label, **line_kwargs):
        # Plot values against k, mark best_k, save as PNG and return the path.
        path = _artifact_path(file_label, "png")
        fig = plt.figure(figsize=(8, 5))
        try:
            plt.plot(ks, values, marker="o", **line_kwargs)
            plt.xlabel("Number of Clusters (k)")
            plt.ylabel(ylabel)
            plt.title(title)
            plt.axvline(best_k, color="red", linestyle="--", label=legend_label)
            plt.grid()
            plt.legend()
            plt.savefig(path)
        finally:
            # Release the figure even if saving fails.
            plt.close(fig)
        return path

    # Sweep k, remembering the fitted model with the lowest inertia so it
    # does not have to be refitted afterwards (same random_state either way).
    inertia_values, silhouette_values = [], []
    final_kmeans = None
    for k in tqdm(cluster_range, desc="MiniBatch K-means Clustering"):
        kmeans = MiniBatchKMeans(n_clusters=k, batch_size=batch_size, random_state=42, n_init="auto")
        kmeans.fit(matrix_tfidf)
        inertia_values.append(kmeans.inertia_)
        silhouette_values.append(_safe_silhouette(kmeans.labels_))
        # Strict "<" keeps the first minimum, matching np.argmin below.
        if final_kmeans is None or kmeans.inertia_ < final_kmeans.inertia_:
            final_kmeans = kmeans

    # Determine the best k by inertia and by silhouette score.
    best_k_inertia = cluster_range[int(np.argmin(inertia_values))]
    scored = [(k, score) for k, score in zip(ks, silhouette_values) if not np.isnan(score)]
    if scored:
        best_k_silhouette = max(scored, key=lambda pair: pair[1])[0]
    else:
        # No usable silhouette score: fall back to the inertia-based k.
        best_k_silhouette = best_k_inertia

    # The final model is the one selected by inertia.
    final_k = best_k_inertia
    cluster_assignments = final_kmeans.labels_

    # Save the model, vectorizer, cluster assignments, and scores.
    model_path = _dump(final_kmeans, "kmeans_model")
    vectorizer_path = _dump(vectorizer, "tfidf_vectorizer")
    cluster_assignments_path = _dump(cluster_assignments, "cluster_assignments")
    scores_path = _dump(
        {"inertia": inertia_values, "silhouette": silhouette_values},
        "cluster_scores",
    )

    # Plot inertia and silhouette score against k.
    elbow_plot_path = _save_line_plot(
        inertia_values,
        best_k_inertia,
        "elbow_plot",
        "Inertia (Sum of Squares)",
        "Elbow Method: Inertia vs. Number of Clusters",
        f"Best k (Inertia) = {best_k_inertia}",
    )
    silhouette_plot_path = _save_line_plot(
        silhouette_values,
        best_k_silhouette,
        "silhouette_plot",
        "Silhouette Score",
        "Silhouette Score vs. Number of Clusters",
        f"Best k (Silhouette) = {best_k_silhouette}",
        color="green",
    )

    return {
        "model_path": model_path,
        "vectorizer_path": vectorizer_path,
        "cluster_assignments_path": cluster_assignments_path,
        "scores_path": scores_path,
        "elbow_plot_path": elbow_plot_path,
        "silhouette_plot_path": silhouette_plot_path,
        "best_k_inertia": best_k_inertia,
        "best_k_silhouette": best_k_silhouette,
        "final_k": final_k,
        "inertia_values": inertia_values,
        "silhouette_values": silhouette_values,
        "cluster_range": ks,
    }
