import datetime
import importlib
import math
import os
import pickle
import subprocess
import sys

import matplotlib.pyplot as plt
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel
from tqdm import tqdm

# Package names that install_packages() tries to import and, when the import
# fails, passes to `pip install`.
REQUIRED_PACKAGES = ["gensim", "matplotlib", "tqdm"]

def install_packages(packages=None):
    """Install any of the given packages that cannot be imported.

    Each name is first imported; only when that raises ImportError is
    ``pip install <name>`` run with the current interpreter. The same string
    is used as both the import name and the pip name.

    Args:
        packages: Iterable of package names to check. Defaults to the
            module-level ``REQUIRED_PACKAGES``.

    Returns:
        list: Names of packages that were missing and could not be installed.
        Empty when every package is importable or was installed.
    """
    if packages is None:
        packages = REQUIRED_PACKAGES

    failed = []
    for package in packages:
        try:
            importlib.import_module(package)
        except ImportError:
            print(f"Package '{package}' is missing. Attempting to install...")
        else:
            continue

        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except (subprocess.SubprocessError, OSError) as e:
            # Best effort: report the failure and keep going with the rest.
            print(f"Failed to install package '{package}': {e}")
            failed.append(package)
        else:
            # Let the import system see files that pip just wrote to disk.
            importlib.invalidate_caches()
    return failed

# Ensure required packages are installed.
# NOTE(review): this call runs after the third-party imports at the top of the
# file, so a missing package raises ImportError there before this line is
# reached; it would have to run before those imports to have any effect.
install_packages()

def explore_topic_modeling(
    ngram_texts, path_dset, dset_name, min_topics=2, max_topics=10, step=1, no_below=3, no_above=0.8
):
    """Sweep LDA topic counts by c_v coherence, train a final model, and save all outputs.

    One LDA model (10 passes) is trained for every topic count in
    ``range(min_topics, max_topics + 1, step)`` and scored with gensim's
    ``c_v`` coherence. The first topic count with the highest score is then
    used to train the final model (20 passes). Every artifact is written to
    ``path_dset`` with a ``YYYYMMDDTHHMM`` timestamp in its file name, so two
    runs for the same ``dset_name`` within the same minute overwrite each other.

    Args:
        ngram_texts: Tokenised documents. The collection is iterated several
            times (dictionary, bag-of-words, coherence), so it must be
            re-iterable rather than a one-shot generator.
        path_dset: Output directory; created if it does not exist.
        dset_name: Prefix for every output file name.
        min_topics: Smallest topic count to try (at least 1).
        max_topics: Largest topic count to try (inclusive upper bound).
        step: Increment between tried topic counts (at least 1).
        no_below: Passed to ``Dictionary.filter_extremes``.
        no_above: Passed to ``Dictionary.filter_extremes``.

    Returns:
        dict: Paths of the saved files under the keys ``"dictionary"``,
        ``"corpus"``, ``"lda_model"``, ``"coherence_scores"`` and
        ``"coherence_plot"``, plus the chosen topic count under
        ``"optimal_topics"``.

    Raises:
        ValueError: If the topic range is empty or invalid, or if no term
            survives the dictionary filtering.
    """
    # Validate the sweep first so a bad range fails before anything is
    # written to disk or any model is trained.
    if step < 1:
        raise ValueError(f"step must be at least 1, got {step!r}")
    if min_topics < 1:
        raise ValueError(f"min_topics must be at least 1, got {min_topics!r}")
    if max_topics < min_topics:
        raise ValueError(
            f"max_topics ({max_topics!r}) must not be smaller than min_topics ({min_topics!r})"
        )

    timestamp = datetime.datetime.now().strftime("%Y%m%dT%H%M")
    output_dir = os.fspath(path_dset)
    os.makedirs(output_dir, exist_ok=True)

    dictionary = Dictionary(ngram_texts)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    if len(dictionary) == 0:
        raise ValueError(
            "No terms left after filter_extremes "
            f"(no_below={no_below!r}, no_above={no_above!r}); cannot train LDA."
        )
    corpus = [dictionary.doc2bow(text) for text in ngram_texts]

    dictionary_path = os.path.join(output_dir, f"{dset_name}_dictionary_{timestamp}.dict")
    corpus_path = os.path.join(output_dir, f"{dset_name}_corpus_{timestamp}.pkl")
    dictionary.save(dictionary_path)
    with open(corpus_path, "wb") as f:
        pickle.dump(corpus, f)

    coherence_values = []
    topic_range = range(min_topics, max_topics + 1, step)

    for num_topics in tqdm(topic_range, desc="LDA Topic Modeling"):
        lda_model = LdaModel(
            corpus=corpus, num_topics=num_topics, id2word=dictionary, random_state=42, passes=10
        )
        coherence_model = CoherenceModel(
            model=lda_model, texts=ngram_texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherence_model.get_coherence())

    # A NaN score compares false against everything, so a plain max() would
    # return an arbitrary entry; rank only the scores that are real numbers.
    valid_indices = [i for i, score in enumerate(coherence_values) if not math.isnan(score)]
    if valid_indices:
        # max() keeps the first of equal maxima, i.e. the smallest topic count.
        best_index = max(valid_indices, key=coherence_values.__getitem__)
    else:
        print("Warning: every coherence score is NaN; falling back to the smallest topic count.")
        best_index = 0
    optimal_topic_no = topic_range[best_index]

    coherence_path = os.path.join(output_dir, f"{dset_name}_coherence_scores_{timestamp}.pkl")
    with open(coherence_path, "wb") as f:
        pickle.dump(coherence_values, f)

    coherence_plot_path = os.path.join(output_dir, f"{dset_name}_coherence_plot_{timestamp}.png")
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.plot(list(topic_range), coherence_values, marker="o", label="Coherence Score")
        ax.set_xlabel("Number of Topics")
        ax.set_ylabel("Coherence Score")
        ax.set_title("Coherence Score vs. Number of Topics")
        ax.legend()
        # Explicitly on: grid() without an argument only toggles the state.
        ax.grid(True)
        fig.savefig(coherence_plot_path)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

    # Retrain at the chosen topic count with more passes for the saved model.
    lda_model = LdaModel(
        corpus=corpus, num_topics=optimal_topic_no, id2word=dictionary, random_state=42, passes=20
    )
    lda_model_path = os.path.join(output_dir, f"{dset_name}_lda_model_{timestamp}.model")
    lda_model.save(lda_model_path)

    return {
        "dictionary": dictionary_path,
        "corpus": corpus_path,
        "lda_model": lda_model_path,
        "coherence_scores": coherence_path,
        "coherence_plot": coherence_plot_path,
        "optimal_topics": optimal_topic_no
    }
