# laura maria engist, 2025
# The K-Means Pipeline
# Parameters varied by / supplied by SMAC: gap-open and gap-extension penalties; init and normalize for the k-means clustering algorithm

import tempfile
from evotuner.pipelines.kmeans_pipeline import kmp_cluster_bisecting
from evotuner.pipelines.kmeans_pipeline import kmp_convert_gta_cma
from evotuner.pipelines.kmeans_pipeline import kmp_cost
import os
from evotuner import constants

class KmPipeline:
    """Run the k-means pipeline for one parameter configuration.

    An instance holds the tunable parameters (k-means ``init`` and
    ``normalize``, gap-open ``go`` and gap-extension ``ge`` penalties) plus
    the ``return_type`` handed to the cost stage.
    """

    def __init__(self, init, normalize, go, ge, return_type):
        """Store the configuration used by ``run_km_pipeline``.

        Args:
            init: k-means init setting, forwarded to ``KmpClusterBisecting.e``.
            normalize: k-means normalize setting, forwarded to
                ``KmpClusterBisecting.e``.
            go: Gap-open penalty, forwarded to ``KmpCost.cost``.
            ge: Gap-extension penalty, forwarded to ``KmpCost.cost``.
            return_type: Forwarded unchanged to the ``KmpCost`` constructor.
        """
        self.init = init
        self.normalize = normalize
        self.go = go
        self.ge = ge
        self.return_type = return_type

    def run_km_pipeline(self):
        """Run the full k-means pipeline.

        The input is not a parameter: it is read from ``constants.S_AA``,
        the path to the FASTA file of protein sequences. A temporary
        directory is created for the run, handed to every stage, and deleted
        when the method returns.

        Returns:
            tuple: ``(cost, minimum, maximum)``. ``cost`` is the first value
            returned by ``KmpCost.cost``; ``minimum`` and ``maximum`` are the
            values returned by ``kmpConvertCma.gta_and_m``.
        """
        s = constants.S_AA
        with tempfile.TemporaryDirectory() as tmpdir:
            # The converter is given the current working directory in
            # addition to the temporary one.
            text_dir = os.getcwd()

            # Stage 1: clustering.
            # NOTE(review): PROTT5_EMBEDDINGS is presumably the location of
            # precomputed ProtT5 embeddings; confirm in constants.
            clustering = kmp_cluster_bisecting.KmpClusterBisecting(
                constants.PROTT5_EMBEDDINGS, tmpdir
            )
            e = clustering.e(s, self.init, self.normalize)

            # Stage 2: conversion; also yields the minimum/maximum that the
            # cost stage and the caller need.
            fasta_embedded_sequences = os.path.join(
                tmpdir, "embedded_sequences.fasta"
            )
            convert_and_cma = kmp_convert_gta_cma.kmpConvertCma(text_dir)
            m, minimum, maximum = convert_and_cma.gta_and_m(
                s, e, tmpdir, fasta_embedded_sequences
            )

            # Stage 3: cost. The second return value (previously bound to
            # ``auc``) is not used by this pipeline.
            cost_process = kmp_cost.KmpCost(
                tmpdir, minimum, maximum, self.return_type
            )
            cost, _auc = cost_process.cost(
                m, fasta_embedded_sequences, self.go, self.ge
            )

            return cost, minimum, maximum