import os
import time
import csv
import yaml
import shutil
from snakemake.io import expand
import pandas as pd
import glob
import sys
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from pathlib import Path



# ===== CONFIGURATION VALIDATION =====
def validate_config(config):
    """Validate all configuration parameters"""
    required_keys = ['samples_file', 'output_dir', 'r', 's', 'run_name', 'mge_path']
    missing_keys = [key for key in required_keys if key not in config]
    
    if missing_keys:
        sys.stderr.write(f"WARNING: Missing required configuration keys: {', '.join(missing_keys)}\n")
        sys.exit(1)
    
    # Validate file existence
    if not os.path.exists(config['samples_file']):
        sys.stderr.write(f"WARNING: Samples file not found: {config['samples_file']}\n")
        sys.exit(1)
    
    if not os.path.exists(config['mge_path']):
        sys.stderr.write(f"WARNING: MitoGeneExtractor path not found: {config['mge_path']}\n")
        sys.exit(1)
    
    # Validate r and s parameters
    if not isinstance(config['r'], list) or not config['r']:
        sys.stderr.write("WARNING: 'r' must be a non-empty list\n")
        sys.exit(1)
    
    if not isinstance(config['s'], list) or not config['s']:
        sys.stderr.write("WARNING: 's' must be a non-empty list\n")
        sys.exit(1)

def validate_gene_fetch_config(config):
    """Validate gene_fetch specific configuration"""
    run_gene_fetch = config.get("run_gene_fetch", True)
    if not isinstance(run_gene_fetch, bool):
        sys.stderr.write("WARNING: 'run_gene_fetch' must be a boolean (true/false)\n")
        sys.exit(1)
    
    if run_gene_fetch:
        gene_fetch_config = config.get("gene_fetch", {})
        required_gene_fetch_keys = ['email', 'api_key', 'gene', 'input_type', 'minimum_length']
        missing_gene_fetch_keys = [key for key in required_gene_fetch_keys 
                                 if key not in gene_fetch_config or not gene_fetch_config[key]]
        
        if missing_gene_fetch_keys:
            sys.stderr.write(f"WARNING: Missing required gene_fetch configuration parameters: {', '.join(missing_gene_fetch_keys)}\n")
            sys.exit(1)
            
        # Validate input_type value
        input_type = gene_fetch_config.get("input_type", "").lower()
        if input_type not in ["taxid", "hierarchical"]:
            sys.stderr.write("WARNING: 'input_type' must be either 'taxid' or 'hierarchical'\n")
            sys.exit(1)
			
        # Validate minimum_length value
        minimum_length = gene_fetch_config.get("minimum_length", 500)
        try:
            minimum_length = int(minimum_length)
        except (ValueError, TypeError):
            sys.stderr.write("ERROR: 'minimum_length' must be an integer\n")
            sys.exit(1)
            
        # Validate samples_file exists
        samples_file = gene_fetch_config.get("samples_file", config.get("samples_file", ""))
        if not samples_file or not os.path.exists(samples_file):
            sys.stderr.write(f"WARNING: Gene fetch samples file not found: {samples_file}\n")
            sys.exit(1)
    else:
        # If not using gene_fetch, validate sequence_reference_file exists
        if 'sequence_reference_file' not in config:
            sys.stderr.write("WARNING: 'sequence_reference_file' is required when run_gene_fetch is false\n")
            sys.exit(1)
        
        if not os.path.exists(config['sequence_reference_file']):
            sys.stderr.write(f"WARNING: Sequence reference file not found: {config['sequence_reference_file']}\n")
            sys.exit(1)

def validate_structural_validation_config(config):
    """Validate structural validation configuration"""
    run_structural_validation = config.get("run_structural_validation", True)
    if not isinstance(run_structural_validation, bool):
        sys.stderr.write("WARNING: 'run_structural_validation' must be a boolean (true/false)\n")
        sys.exit(1)

def validate_taxonomic_validation_config(config):
    """Validate taxonomic validation configuration"""
    run_taxonomic_validation = config.get("run_taxonomic_validation", True)
    if not isinstance(run_taxonomic_validation, bool):
        sys.stderr.write("WARNING: 'run_taxonomic_validation' must be a boolean (true/false)\n")
        sys.exit(1)
    
    if run_taxonomic_validation:
        taxonomic_validation_params = config.get("taxonomic_validation", {})
        
        # Validate database path
        database_path = taxonomic_validation_params.get("database", "")
        if not database_path:
            sys.stderr.write("WARNING: 'database' path is required when run_taxonomic_validation is true\n")
            sys.exit(1)
        
        if not os.path.exists(database_path):
            sys.stderr.write(f"WARNING: Taxonomic validation (BLASTn) database not found: {database_path}\n")
            sys.exit(1)
		
        # Validate taxonomy file
        database_taxonomy = taxonomic_validation_params.get("database_taxonomy", "")
        if not database_taxonomy:
            sys.stderr.write("WARNING: 'database_taxonomy' path is required when run_taxonomic_validation is true\n")
            sys.exit(1)
        if not os.path.exists(database_taxonomy):
            sys.stderr.write(f"WARNING: Taxonomy file not found: {database_taxonomy}\n")
            sys.exit(1)
        
        # Validate expected taxonomy file
        expected_taxonomy = taxonomic_validation_params.get("expected_taxonomy", "")
        if not expected_taxonomy:
            sys.stderr.write("WARNING: 'expected_taxonomy' path is required when run_taxonomic_validation is true\n")
            sys.exit(1)
        if not os.path.exists(expected_taxonomy):
            sys.stderr.write(f"WARNING: Expected taxonomy file not found: {expected_taxonomy}\n")
            sys.exit(1)
        
        # Validate min_pident parameter
        min_pident = taxonomic_validation_params.get("min_pident", "80")
        try:
            min_pident_float = float(min_pident)
            if not (0 <= min_pident_float <= 100):
                sys.stderr.write("WARNING: 'min_pident' must be between 0 and 100\n")
                sys.exit(1)
        except (ValueError, TypeError):
            sys.stderr.write("WARNING: 'min_pident' must be a valid number\n")
            sys.exit(1)

# Run all validations
validate_config(config)
validate_gene_fetch_config(config)
validate_structural_validation_config(config)
validate_taxonomic_validation_config(config)

# Print configuration
print("\n=====Configuration loaded:=====\n", config) 



# ===== DATA PARSING FUNCTIONS =====
def parse_samples(samples_file):
    """Parse samples from .csv files"""
    samples = {}
    with open(samples_file, mode='r') as infile:
        reader = csv.DictReader(infile)
        
        # Get the actual column names from the CSV
        fieldnames = reader.fieldnames
        
        # Check for required columns with helpful error messages
        required_columns = {
            'ID': ['ID', 'id', 'sample_id', 'SampleID'],
            'forward': ['forward', 'fwd', 'R1', 'read1'],
            'reverse': ['reverse', 'rev', 'R2', 'read2']
        }
        
        column_mapping = {}
        missing_columns = []
        
        for required, alternatives in required_columns.items():
            found = False
            for alt in alternatives:
                if alt in fieldnames:
                    column_mapping[required] = alt
                    found = True
                    break
            if not found:
                missing_columns.append(f"{required} (acceptable alternatives: {', '.join(alternatives)})")
        
        if missing_columns:
            sys.stderr.write(f"ERROR: samples.csv is missing required columns:\n")
            for missing in missing_columns:
                sys.stderr.write(f"  - {missing}\n")
            sys.stderr.write(f"\nFound columns in CSV: {', '.join(fieldnames)}\n")
            sys.stderr.write(f"\nPlease ensure your CSV has columns for sample ID and forward/reverse reads.\n")
            sys.exit(1)
        
        # Parse rows using the mapped column names
        for row in reader:
            sample_id = row[column_mapping['ID']]
            forward_read = row[column_mapping['forward']]
            reverse_read = row[column_mapping['reverse']]
            samples[sample_id] = {"R1": forward_read, "R2": reverse_read}
    
    return samples

def parse_sequence_references(sequence_reference_file):
    """Parse references from CSV"""
    sequence_references = {}
    with open(sequence_reference_file, mode='r') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            sample_id = row['ID']
            protein_reference_path = row['protein_reference_path']
            sequence_references[sample_id] = {
                "protein_path": protein_reference_path,
            }    
    return sequence_references



# ===== CONFIGURATION PARAMETERS =====
# Load basic parameters
samples = parse_samples(config["samples_file"])
run_name = config["run_name"]
r = config["r"]
s = config["s"]
n = config["n"]
C = config["C"]
t = config["t"]
mge_path = config["mge_path"]

# Gene fetch configuration
run_gene_fetch = config.get("run_gene_fetch", True)
if run_gene_fetch:
    sequence_reference_file_path = os.path.join(config["output_dir"], "02_references", "sequence_references.csv")
else:
    sequence_reference_file_path = config["sequence_reference_file"]

# Downsampling configuration
downsampling_enabled = config.get("downsampling", {}).get("enabled", False)
max_reads = config.get("downsampling", {}).get("max_reads", 0)

# Fasta cleaner configuration
fasta_cleaner_params = config.get("fasta_cleaner", {
    "consensus_threshold": 0.5,
    "human_threshold": 0.95,
    "at_difference": 0.1,
    "at_mode": "absolute",
    "outlier_percentile": 90.0
})

# Structural validation configuration
run_structural_validation = config.get("run_structural_validation", True)
structural_validation_params = config.get("structural_validation", {
    "target": "cox1",
    "verbose": False,
    "relaxed": False
})

# Taxonomic validation configuration
run_taxonomic_validation = config.get("run_taxonomic_validation", True)
taxonomic_validation_params = config.get("taxonomic_validation", {
    "database": "",
    "verbose": True,
    "blast_options": "-evalue 1e-5 -max_target_seqs 20 -num_threads 1",
    "database_taxonomy": "",
    "expected_taxonomy": "",
    "taxval_rank": "family",
    "min_pident": "80"
})

# Rule resources
rule_resources = config["rules"]



# ===== DIRECTORY STRUCTURE SETUP =====
# Create main output directory
main_output_dir = config["output_dir"]
Path(main_output_dir).mkdir(parents=True, exist_ok=True)

# Create preprocessing directories
preprocessing_dir = os.path.join(main_output_dir, "01_preprocessing")
preprocessing_dir_merge = os.path.join(preprocessing_dir, "merge_mode")
preprocessing_dir_concat = os.path.join(preprocessing_dir, "concat_mode")

# Create barcode recovery directories
barcode_recovery_dir = os.path.join(main_output_dir, "03_barcode_recovery")
barcode_recovery_dir_merge = os.path.join(barcode_recovery_dir, "merge_mode")
barcode_recovery_dir_concat = os.path.join(barcode_recovery_dir, "concat_mode")

# Create references directory if using gene_fetch
if run_gene_fetch:
    references_dir = os.path.join(main_output_dir, "02_references")
    Path(references_dir).mkdir(parents=True, exist_ok=True)



# ===== HELPER FUNCTIONS =====
def get_hmm_file_for_target(target):
    """Map target gene to corresponding HMM file path"""
    hmm_mapping = {
        "cox1": "resources/hmm/COI-5P.hmm",
        "coi": "resources/hmm/COI-5P.hmm",
        "rbcl": "resources/hmm/rbcL.hmm"
    }
    
    target_lower = target.lower()
    if target_lower not in hmm_mapping:
        raise ValueError(f"Unknown target '{target}'. Available targets: {list(hmm_mapping.keys())}")
    
    return hmm_mapping[target_lower]

def get_protein_reference(wildcards):
    """Get protein reference path for a sample"""
    if run_gene_fetch:
        return os.path.join(main_output_dir, f"02_references/protein/{wildcards.sample}.fasta")
    else:
        sequence_refs = parse_sequence_references(config["sequence_reference_file"])
        return sequence_refs[wildcards.sample]["protein_path"]

def get_all_mge_consensus_files(mode):
    """Generate list of expected consensus files for a given mode"""
    output_dir = barcode_recovery_dir_merge if mode == "merge" else barcode_recovery_dir_concat
    files = []
    
    for sample in samples.keys():
        for r_val in r:
            for s_val in s:
                files.append(os.path.join(output_dir, f"consensus/{sample}_r_{r_val}_s_{s_val}_con_{sample}.fas"))
    
    return files

def get_mge_input_merge(wildcards):
    """Get the right input for merge mode"""
    return os.path.join(preprocessing_dir_merge, f"trimmed_data/{wildcards.sample}/{wildcards.sample}_merged_clean.fastq")

def get_mge_input_concat(wildcards):
    """Get input for concat mode"""
    if downsampling_enabled:
        return os.path.join(preprocessing_dir_concat, f"trimmed_data/{wildcards.sample}/{wildcards.sample}_concat_trimmed_downsampled.fastq")
    else:
        return os.path.join(preprocessing_dir_concat, f"trimmed_data/{wildcards.sample}/{wildcards.sample}_concat_trimmed.fq")

# Set up derived parameters
hmm_file_path = get_hmm_file_for_target(structural_validation_params["target"]) if run_structural_validation else ""
reference_dir = fasta_cleaner_params.get("reference_dir", None)
use_reference_filtering = reference_dir and reference_dir not in ["None", "null", "", None]



# ===== SNAKEMAKE RULE ALL =====
rule all:
    input:
        # ==== GENE FETCH (CONDITIONAL) ====
        (os.path.join(main_output_dir, "02_references/sequence_references.csv") if run_gene_fetch else []),
        
        # ==== PREPROCESSING OUTPUTS ====
        # Merge mode preprocessing
        os.path.join(preprocessing_dir_merge, "logs/clean_headers/clean_headers.log"),
        
        # Concat mode preprocessing
        os.path.join(preprocessing_dir_concat, "logs/concat/concat_reads.log"),
        os.path.join(preprocessing_dir_concat, "logs/trim_galore/trim_galore.log"),
        expand(os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R1_trimmed.fastq.gz"), sample=list(samples.keys())),
        expand(os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R2_trimmed.fastq.gz"), sample=list(samples.keys())), 
        
        # ==== BARCODE RECOVERY OUTPUTS ====
        # Merge mode barcode recovery
        os.path.join(barcode_recovery_dir_merge, f"consensus/{run_name}_cons_combined-merge.fasta"),
        os.path.join(barcode_recovery_dir_merge, "logs/mge/alignment_files.log"),
        os.path.join(barcode_recovery_dir_merge, f"{run_name}_merge-stats.csv"),
        os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner_complete.txt"),
        os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/combined_statistics.csv"),
        os.path.join(barcode_recovery_dir_merge, "logs/exonerate_int_cleanup_complete.txt"),
        os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/01_human_filtered/human_filtered.txt"),
        os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/01_human_filtered/human_filter_metrics.csv"),
        os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/02_at_filtered/at_filtered.txt"),
        os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/02_at_filtered/at_filter_summary.csv"),
        os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered/outlier_filtered.txt"),
        os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered/outlier_filter_summary_metrics.csv"),
        (os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/04_reference_filtered/reference_filtered.txt") if use_reference_filtering else []),
        (os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/04_reference_filtered/reference_filter_metrics.csv") if use_reference_filtering else []), 
        os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/cleaned_cons_combined.fasta"),
        os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-merge.csv"),
        
        # Concat mode barcode recovery
        os.path.join(barcode_recovery_dir_concat, f"consensus/{run_name}_cons_combined-concat.fasta"),
        os.path.join(barcode_recovery_dir_concat, "logs/mge/alignment_files.log"),
        os.path.join(barcode_recovery_dir_concat, f"{run_name}_concat-stats.csv"),
        os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner_complete.txt"),
        os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/combined_statistics.csv"),
        os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/01_human_filtered/human_filtered.txt"),
        os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/01_human_filtered/human_filter_metrics.csv"),
        os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/02_at_filtered/at_filtered.txt"),
        os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/02_at_filtered/at_filter_summary.csv"),
        os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered/outlier_filtered.txt"),
        os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered/outlier_filter_summary_metrics.csv"),
        (os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/04_reference_filtered/reference_filtered.txt") if use_reference_filtering else []),
        (os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/04_reference_filtered/reference_filter_metrics.csv") if use_reference_filtering else []),
        os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/cleaned_cons_combined.fasta"),
        os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-concat.csv"),
        os.path.join(barcode_recovery_dir_concat, "logs/exonerate_int_cleanup_complete.txt"),
		
        # ==== BARCODE RECOVERY OUTPUTS (EXTRA) ====
        os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/fasta_cleaner_complete.txt"),
        os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/fasta_cleaner_complete.txt"),
        os.path.join(barcode_recovery_dir, "barcode_consensus_count.tsv"),
        os.path.join(barcode_recovery_dir, "logs/barcode_consensus_count.log"),
		
		# ==== STRUCTURAL VALIDATION (CONDITIONAL) ====
        (os.path.join(main_output_dir, f"04_barcode_validation/structural/structural_validation.csv") if run_structural_validation else []),
        (os.path.join(main_output_dir, f"04_barcode_validation/structural/output_barcode_all_passing.fasta") if run_structural_validation else []),
		
		# ==== TAXONOMIC VALIDATION (CONDITIONAL) ====
        (os.path.join(main_output_dir, f"04_barcode_validation/taxonomic/metrics/01_local_blast_output.csv") if run_taxonomic_validation else []),
        (os.path.join(main_output_dir, f"04_barcode_validation/taxonomic/validated_barcodes.fasta") if run_taxonomic_validation else []),
        (os.path.join(main_output_dir, f"04_barcode_validation/taxonomic/metrics/02_taxonomic_validation.csv") if run_taxonomic_validation else []),
       
        # ==== FINAL OUTPUTS ====
        os.path.join(barcode_recovery_dir, f"{run_name}_barcode_recovery_metrics.csv"),
        os.path.join(barcode_recovery_dir, f"{run_name}_all_cons_combined.fasta"),
        (os.path.join(main_output_dir, f"{run_name}_final_metrics.csv") if run_structural_validation and run_taxonomic_validation else []),
        (os.path.join(main_output_dir, "05_barcoding_outcome/barcoding_outcome.tsv") if run_structural_validation and run_taxonomic_validation else []),
        os.path.join(preprocessing_dir_concat, "logs/final_cleanup_complete.txt"),
        os.path.join(preprocessing_dir_merge, "logs/final_cleanup_complete.txt"),
		
# ===== GENE FETCH RULE =====
if run_gene_fetch:
    rule gene_fetch:
        input:
            samples_csv=lambda wildcards: config["gene_fetch"].get("samples_file", config["samples_file"])
        output:
            sequence_references=os.path.join(main_output_dir, "02_references/sequence_references.csv"),
            protein_files=expand(
                os.path.join(main_output_dir, "02_references/protein/{sample}.fasta"),
                sample=list(samples.keys())
            )
        params:
            email=config["gene_fetch"]["email"],
            api_key=config["gene_fetch"]["api_key"],
            gene=config["gene_fetch"]["gene"],
            input_type=config["gene_fetch"]["input_type"].lower(),
            genbank=config["gene_fetch"].get("genbank", False),
            output_dir=os.path.join(main_output_dir, "02_references"),
            min_length=config["gene_fetch"].get("minimum_length", False),
            genbank_flag="--genbank" if config["gene_fetch"].get("genbank", False) else "",
            input_flag="--in" if config["gene_fetch"]["input_type"].lower() == "taxid" else "--in2"
        log:
            os.path.join(main_output_dir, "02_references/gene_fetch.log")
        threads: rule_resources["gene_fetch"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["gene_fetch"]["mem_mb"] * attempt
        retries: 3
        shell:
            """
            set -euo pipefail
            
            echo "Starting gene-fetch: $(date)" > {log}
            echo "Email: {params.email}" >> {log}
            echo "Gene: {params.gene}" >> {log}
            echo "Type: protein" >> {log}
            echo "Input type: {params.input_type}" >> {log}
            echo "Genbank: {params.genbank}" >> {log}
            echo "Input samples: {input.samples_csv}" >> {log}
            echo "Output directory: {params.output_dir}" >> {log}
            
            # Run gene-fetch
            gene-fetch \\
                --email {params.email} \\
                --api-key {params.api_key} \\
                {params.input_flag} {input.samples_csv} \\
                --gene {params.gene} \\
                --type protein \\
                --protein-size {params.min_length} \\
                {params.genbank_flag} \\
                --out {params.output_dir} \\
                >> {log} 2>&1
            
            echo "Gene-fetch completed: $(date)" >> {log}
            echo "Output file: {output.sequence_references}" >> {log}
            
            # Verify output file was created
            if [ ! -f "{output.sequence_references}" ]; then
                echo "ERROR: Expected output file not found: {output.sequence_references}" >> {log}
                exit 1
            fi
            """

# ===== PREPROCESSING RULES - MERGE MODE =====
rule fastp_pe_merge:
    input:
        R1=lambda wildcards: samples[wildcards.sample]["R1"],
        R2=lambda wildcards: samples[wildcards.sample]["R2"]
    output:
        R1_trimmed=temp(os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_R1_trimmed.fastq.gz")),
        R2_trimmed=temp(os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_R2_trimmed.fastq.gz")),
        report=os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_fastp_report.html"),
        json=os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_fastp_report.json"),
        merged_reads=os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_merged.fastq"),
        unpaired_R1=temp(os.path.join(preprocessing_dir_merge, "trimmed_data/unpaired/{sample}/{sample}_unpaired_R1.fastq")),
        unpaired_R2=temp(os.path.join(preprocessing_dir_merge, "trimmed_data/unpaired/{sample}/{sample}_unpaired_R2.fastq"))
    log:
        out=os.path.join(preprocessing_dir_merge, "logs/fastp/{sample}_fastp.out"),
        err=os.path.join(preprocessing_dir_merge, "logs/fastp/{sample}_fastp.err")
    threads: rule_resources["fastp_qc"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["fastp_qc"]["mem_mb"] * attempt
    retries: 3
    shell:
        """
        set -euo pipefail
		
        # Log job start
        echo "======== Fastp Merge Processing ========" > {log.out}
        echo "Sample: {wildcards.sample}" >> {log.out}
        echo "Started: $(date)" >> {log.out}
        echo "Input files: {input.R1}, {input.R2}" >> {log.out}
        echo "Threads: {threads}" >> {log.out}
        echo "=========================================" >> {log.out}

        # Check input files exist
        if [[ ! -f "{input.R1}" ]] || [[ ! -f "{input.R2}" ]]; then
            echo "ERROR: Input files not found" >> {log.out}
            exit 1
        fi

        # Log input file sizes
        echo "Input file sizes:" >> {log.out}
        echo "  R1: $(numfmt --to=iec $(stat -c%s {input.R1}))" >> {log.out}
        echo "  R2: $(numfmt --to=iec $(stat -c%s {input.R2}))" >> {log.out}

        # Run fastp
        echo "Running fastp: $(date)" >> {log.out}
        if fastp -i {input.R1} -I {input.R2} \\
              -o {output.R1_trimmed} -O {output.R2_trimmed} \\
              --adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \\
              --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \\
              --dedup \\
              --trim_poly_g \\
              --thread {threads} \\
              --merge --merged_out {output.merged_reads} \\
              --unpaired1 {output.unpaired_R1} \\
              --unpaired2 {output.unpaired_R2} \\
              -h {output.report} -j {output.json} \\
              >> {log.out} 2>> {log.err}; then
            echo "Fastp completed successfully: $(date)" >> {log.out}
        else
            echo "ERROR: Fastp failed with exit code $?" >> {log.out}
            exit 1
        fi

        # Log output file sizes
        echo "Output file sizes:" >> {log.out}
        echo "  R1_trimmed: $(numfmt --to=iec $(stat -c%s {output.R1_trimmed}))" >> {log.out}
        echo "  R2_trimmed: $(numfmt --to=iec $(stat -c%s {output.R2_trimmed}))" >> {log.out}
        echo "  Merged: $(numfmt --to=iec $(stat -c%s {output.merged_reads}))" >> {log.out}
        echo "Processing completed: $(date)" >> {log.out}
        """

if downsampling_enabled:
    rule downsample_merge:
        input:
            merged_reads=temp(os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_merged.fastq"))
        output:
            downsampled=temp(os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_merged_downsampled.fastq"))
        params:
            max_reads=config["downsampling"]["max_reads"]
        log:
            out=os.path.join(preprocessing_dir_merge, "logs/downsample/{sample}.out"),
            err=os.path.join(preprocessing_dir_merge, "logs/downsample/{sample}.err")
        threads: rule_resources["downsample"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["downsample"]["mem_mb"] * attempt
        retries: 3
        shell:
            """
            set -euo pipefail

            echo "Downsampling merged reads for {wildcards.sample}" > {log.out}
            echo "Started: $(date)" >> {log.out}
            echo "Target reads: {params.max_reads}" >> {log.out}

            reformat.sh in={input.merged_reads} \\
                        out={output.downsampled} \\
                        samplereadstarget={params.max_reads} \\
                        sampleseed=12345 \\
                        >> {log.out} 2>> {log.err}

            echo "Completed: $(date)" >> {log.out}
            echo "Output file size: $(numfmt --to=iec $(stat -c%s {output.downsampled}))" >> {log.out}
            """

rule clean_headers_merge:
    input:
        merged_reads=os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_merged_downsampled.fastq") if downsampling_enabled else os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_merged.fastq")
    output:
        clean_merged=os.path.join(preprocessing_dir_merge, "trimmed_data/{sample}/{sample}_merged_clean.fastq")
    log:
        out=temp(os.path.join(preprocessing_dir_merge, "logs/clean_headers/{sample}.out")),
        err=temp(os.path.join(preprocessing_dir_merge, "logs/clean_headers/{sample}.err"))
    threads: rule_resources["clean_headers_merge"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["clean_headers_merge"]["mem_mb"] * attempt
    retries: 3
    shell:
        """
        set -euo pipefail

        # Create directories
        mkdir -p $(dirname {log.out})
        mkdir -p $(dirname {output.clean_merged})

        # Create log header
        echo "Cleaning headers for {wildcards.sample}" > {log.out}
        echo "Started: $(date)" >> {log.out}
        echo "Host: $(hostname)" >> {log.out}

        # Simple sed replacement with error handling
        if sed 's/ /_/g' {input.merged_reads} > {output.clean_merged} 2>> {log.err}; then
            echo "Completed: $(date)" >> {log.out}
            echo "Output file size: $(numfmt --to=iec $(stat -c%s {output.clean_merged}))" >> {log.out}
            echo "--------------------------------------------" >> {log.out}
        else
            echo "FAILED: $(date)" >> {log.out}
            exit 1
        fi
        """

rule aggregate_clean_headers_logs_merge:
    input:
        sample_logs=expand(os.path.join(preprocessing_dir_merge, "logs/clean_headers/{sample}.out"), sample=list(samples.keys()))
    output:
        combined_log=os.path.join(preprocessing_dir_merge, "logs/clean_headers/clean_headers.log") 
    log:
        out=os.path.join(preprocessing_dir_merge, "logs/clean_headers/aggregate_clean_headers.out"),
        err=os.path.join(preprocessing_dir_merge, "logs/clean_headers/aggregate_clean_headers.err")
    threads: 1
    resources:
        mem_mb=3076
    retries: 3
    shell:
        """
        set -euo pipefail

        # Create header for aggregated log file
        echo "===============================================" > {output.combined_log}
        echo "Aggregated clean headers logs - Created $(date)" >> {output.combined_log}
        echo "===============================================" >> {output.combined_log}
        echo "" >> {output.combined_log}

        # Concatenate all sample logs into a combined log
        cat {input.sample_logs} >> {output.combined_log} 2>> {log.err}
        echo "Aggregation complete." >> {log.out}
        """
		
# ===== PREPROCESSING RULES - CONCAT MODE =====
rule fastp_pe_concat:
    input:
        R1=lambda wildcards: samples[wildcards.sample]["R1"],
        R2=lambda wildcards: samples[wildcards.sample]["R2"]
    output:
        R1_trimmed=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R1_trimmed.fastq"),
        R2_trimmed=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R2_trimmed.fastq"),
        report=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_fastp_report.html"),
        json=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_fastp_report.json")
    log:
        out=os.path.join(preprocessing_dir_concat, "logs/fastp/{sample}_fastp.out"),
        err=os.path.join(preprocessing_dir_concat, "logs/fastp/{sample}_fastp.err")
    threads: rule_resources["fastp_qc"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["fastp_qc"]["mem_mb"] * attempt
    retries: 3
    shell:
        """
        set -euo pipefail

        # Create output directories
        mkdir -p $(dirname {output.R1_trimmed})
        mkdir -p $(dirname {output.report})
        
        # Log job start
        echo "======== Fastp Concat Processing ========" > {log.out}
        echo "Sample: {wildcards.sample}" >> {log.out}
        echo "Started: $(date)" >> {log.out}
        echo "Input files: {input.R1}, {input.R2}" >> {log.out}
        echo "Threads: {threads}" >> {log.out}
        echo "=========================================" >> {log.out}
        
        # Check input files exist
        if [[ ! -f "{input.R1}" ]] || [[ ! -f "{input.R2}" ]]; then
            echo "ERROR: Input files not found" >> {log.out}
            exit 1
        fi

        # Log input file sizes
        echo "Input file sizes:" >> {log.out}
        echo "  R1: $(numfmt --to=iec $(stat -c%s {input.R1}))" >> {log.out}
        echo "  R2: $(numfmt --to=iec $(stat -c%s {input.R2}))" >> {log.out}
        
        # Run fastp
        echo "Running fastp: $(date)" >> {log.out}
        if fastp -i {input.R1} -I {input.R2} \\
                 -o {output.R1_trimmed} -O {output.R2_trimmed} \\
                 -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \\
                 -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \\
                 --dedup \\
                 --trim_poly_g \\
                 --thread {threads} \\
                 -h {output.report} -j {output.json} \\
                 >> {log.out} 2>> {log.err}; then
            echo "Fastp completed successfully: $(date)" >> {log.out}
        else
            echo "ERROR: Fastp failed with exit code $?" >> {log.out}
            exit 1
        fi
        
        # Validate outputs exist
        if [[ ! -s {output.R1_trimmed} ]] || [[ ! -s {output.R2_trimmed} ]]; then
            echo "ERROR: Output files are empty or missing" >> {log.out}
            exit 1
        fi
        
        # Clean headers
        echo "Cleaning headers: $(date)" >> {log.out}
        perl -i -pe 's/ /_/g' {output.R1_trimmed}
        perl -i -pe 's/ /_/g' {output.R2_trimmed}
        
        # Log output file sizes
        echo "Output file sizes:" >> {log.out}
        echo "  R1_trimmed: $(numfmt --to=iec $(stat -c%s {output.R1_trimmed}))" >> {log.out}
        echo "  R2_trimmed: $(numfmt --to=iec $(stat -c%s {output.R2_trimmed}))" >> {log.out}
        echo "Processing completed: $(date)" >> {log.out}
        """

rule fastq_concat:
    input:
        R1=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R1_trimmed.fastq"),
        R2=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R2_trimmed.fastq")
    output:
        temp(os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_concat.fastq"))
    log:
        out=os.path.join(preprocessing_dir_concat, "logs/concat/{sample}.out")
    threads: rule_resources["fastq_concat"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["fastq_concat"]["mem_mb"] * attempt
    retries: 3
    shell:
        """
        set -euo pipefail
                
        echo "Concatenating files for {wildcards.sample}" > {log.out}
        echo "Started: $(date)" >> {log.out}

        # Concatenation
        (cat {input.R1} && cat {input.R2}) > {output} 2>> {log.out}

        echo "Completed: $(date)" >> {log.out}
        echo "Output file size: $(numfmt --to=iec $(stat -c%s {output}))" >> {log.out}
        """

rule gzip_trimmed:
    input:
        R1=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R1_trimmed.fastq"),
        R2=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R2_trimmed.fastq"),
        concat=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_concat.fastq")
    output:
        R1_gz=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R1_trimmed.fastq.gz"),
        R2_gz=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_R2_trimmed.fastq.gz")
    log:
        out=os.path.join(preprocessing_dir_concat, "logs/gzip/{sample}.out")
    threads: 1  # gzip is single-threaded
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["gzip_trimmed"]["mem_mb"] * attempt
    retries: 3
    shell:
        """
        set -euo pipefail
        
        echo "Compressing trimmed files for {wildcards.sample}" > {log.out}
        echo "Started: $(date)" >> {log.out}
        
        # Compress the trimmed files using gzip
        echo "Compressing R1 file with gzip..." >> {log.out}
        gzip -c {input.R1} > {output.R1_gz} 2>> {log.out}
        
        echo "Compressing R2 file with gzip..." >> {log.out}
        gzip -c {input.R2} > {output.R2_gz} 2>> {log.out}
        
        # Remove the original uncompressed files
        echo "Removing original uncompressed files..." >> {log.out}
        rm {input.R1} {input.R2}
        
        echo "Compression completed: $(date)" >> {log.out}
        echo "Compressed file sizes:" >> {log.out}
        echo "  R1: $(numfmt --to=iec $(stat -c%s {output.R1_gz}))" >> {log.out}
        echo "  R2: $(numfmt --to=iec $(stat -c%s {output.R2_gz}))" >> {log.out}
        """

rule aggregate_concat_logs:
    input:
        sample_logs=expand(os.path.join(preprocessing_dir_concat, "logs/concat/{sample}.out"), sample=list(samples.keys()))
    output:
        combined_log=os.path.join(preprocessing_dir_concat, "logs/concat/concat_reads.log")
    log:
        out=os.path.join(preprocessing_dir_concat, "logs/concat/aggregate_concat.out"),
        err=os.path.join(preprocessing_dir_concat, "logs/concat/aggregate_concat.err")
    threads: 1
    resources:
        mem_mb=3076
    retries: 3
    shell:
        """
        set -euo pipefail

        echo "===============================================" > {output.combined_log}
        echo "Aggregated concat logs - Created $(date)" >> {output.combined_log}
        echo "===============================================" >> {output.combined_log}
        echo "" >> {output.combined_log}

        # Concatenate all sample logs into a combined log
        cat {input.sample_logs} >> {output.combined_log} 2>> {log.err}
        echo "Aggregation complete." >> {log.out}
        """

rule quality_trim:
    input:
        os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_concat.fastq")
    output:
        concat_trimmed=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_concat_trimmed.fq"),
        report=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_concat.fastq_trimming_report.txt")
    params:
        outdir=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}"),
        report_dir=os.path.join(preprocessing_dir_concat, "trimmed_data/")
    log:
        out=os.path.join(preprocessing_dir_concat, "logs/trim_galore/{sample}.out"),
        err=os.path.join(preprocessing_dir_concat, "logs/trim_galore/{sample}.err")
    threads: rule_resources["quality_trim"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["quality_trim"]["mem_mb"] * attempt
    retries: 3
    shell:
        """
        set -euo pipefail

        echo "Quality trimming for {wildcards.sample}" > {log.out}
        echo "Started: $(date)" >> {log.out}
        echo "Input file: {input}" >> {log.out}

        # Run trim_galore
        trim_galore --cores {threads} \\
                    --dont_gzip \\
                    --output_dir {params.outdir} \\
                    --basename {wildcards.sample}_concat \\
                    {input} >> {log.out} 2>> {log.err}

        # Log job completion
        echo "Completed: $(date)" >> {log.out}
        echo "Output file size: $(numfmt --to=iec $(stat -c%s {output[0]}))" >> {log.out}
        """

if downsampling_enabled:
    rule downsample_concat:
        input:
            concat_trimmed=os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_concat_trimmed.fq")
        output:
            downsampled=temp(os.path.join(preprocessing_dir_concat, "trimmed_data/{sample}/{sample}_concat_trimmed_downsampled.fastq"))
        params:
            max_reads=config["downsampling"]["max_reads"] * 2  # Multiply by 2 because concat mode has R1+R2 reads
        log:
            out=os.path.join(preprocessing_dir_concat, "logs/downsample/{sample}.out"),
            err=os.path.join(preprocessing_dir_concat, "logs/downsample/{sample}.err")
        threads: rule_resources["downsample"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["downsample"]["mem_mb"] * attempt
        retries: 3
        shell:
            """
            set -euo pipefail

            echo "Downsampling concatenated trimmed reads for {wildcards.sample}" > {log.out}
            echo "Started: $(date)" >> {log.out}
            echo "Target reads: {params.max_reads} (2x read pairs for concat mode)" >> {log.out}

            reformat.sh in={input.concat_trimmed} \\
                        out={output.downsampled} \\
                        samplereadstarget={params.max_reads} \\
                        sampleseed=12345 \\
                        >> {log.out} 2>> {log.err}

            echo "Completed: $(date)" >> {log.out}
            echo "Output file size: $(numfmt --to=iec $(stat -c%s {output.downsampled}))" >> {log.out}
            """

rule aggregate_trim_galore_logs:
    input:
        sample_logs=expand(os.path.join(preprocessing_dir_concat, "logs/trim_galore/{sample}.out"), sample=list(samples.keys()))
    output:
        combined_log=os.path.join(preprocessing_dir_concat, "logs/trim_galore/trim_galore.log")
    log:
        out=os.path.join(preprocessing_dir_concat, "logs/aggregate_trim_galore.out"),
        err=os.path.join(preprocessing_dir_concat, "logs/aggregate_trim_galore.err")
    threads: 1
    resources:
        mem_mb=3076
    shell:
        """
        set -euo pipefail

        echo "===============================================" > {output.combined_log}
        echo "Aggregated trim_galore logs - Created $(date)" >> {output.combined_log}
        echo "===============================================" >> {output.combined_log}
        echo "" >> {output.combined_log}

        # Concatenate all sample logs into a combined log
        cat {input.sample_logs} >> {output.combined_log} 2>> {log.err}
        echo "Aggregation complete." >> {log.out}
        """
		
# ===== MGE RULES FOR BARCODE RECOVERY =====

# MGE rule for merge mode
rule MitoGeneExtractor_merge:
    input:
        DNA=get_mge_input_merge,
        AA=get_protein_reference
    output:
        alignment=os.path.join(barcode_recovery_dir_merge, "alignment/{sample}_r_{r}_s_{s}_align_{sample}.fas"),
        consensus=os.path.join(barcode_recovery_dir_merge, "consensus/{sample}_r_{r}_s_{s}_con_{sample}.fas")
    log:
        out=os.path.join(barcode_recovery_dir_merge, "out/{sample}_r_{r}_s_{s}_summary.out"),
        err=os.path.join(barcode_recovery_dir_merge, "err/{sample}_r_{r}_s_{s}_summary.err")
    params:
        mge_executor=config["mge_path"],
        n=config["n"],
        C=config["C"],
        t=config["t"],		
        output_dir=barcode_recovery_dir_merge,
        vulgar_dir=lambda wildcards: os.path.join(barcode_recovery_dir_merge, f"logs/mge/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}/")
    threads: rule_resources["MitoGeneExtractor"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["MitoGeneExtractor"]["mem_mb"] * attempt,
        slurm_partition=rule_resources["MitoGeneExtractor"]["partition"]
    retries: 4
    shell:
        """
        set -euo pipefail

        # Check if input file is empty or missing
        if [ ! -s "{input.DNA}" ]; then
            echo "===== EMPTY/MISSING INPUT DETECTED =====" >> {log.out}
            echo "Started: $(date)" >> {log.out}
            echo "Sample: {wildcards.sample}" >> {log.out}
            echo "Mode: merge" >> {log.out}
            echo "Input file: {input.DNA}" >> {log.out}
            
            if [ ! -f "{input.DNA}" ]; then
                echo "Input file does not exist" >> {log.out}
            else
                FILE_SIZE=$(stat -c%s "{input.DNA}" 2>/dev/null || echo "unknown")
                echo "Input file size: $FILE_SIZE bytes" >> {log.out}
                echo "File is empty - likely all reads were filtered out during trimming" >> {log.out}
            fi
            
            echo "Creating mock outputs..." >> {log.out}
            echo ">Consensus__{wildcards.sample}" > {output.consensus}
            touch {output.alignment}
            touch {log.err}
            echo "Mock outputs created successfully: $(date)" >> {log.out}
            echo "=========================================" >> {log.out}
            exit 0
        fi

        # Create vulgar directory if it doesn't exist
        mkdir -p {params.vulgar_dir}
        
        # Job info and file analysis
        echo "===== Job Info =====" >> {log.out}
        echo "Started: $(date)" >> {log.out}
        echo "Node: $(hostname)" >> {log.out}
        echo "Memory allocated: $(({resources.mem_mb} / 1024))GB" >> {log.out}
        echo "Threads: {threads}" >> {log.out}
        echo "Mode: merge" >> {log.out}
        echo "Sample: {wildcards.sample}" >> {log.out}
        
        # Accurate file size analysis (using stat - the reliable method)
        FILE_SIZE_BYTES=$(stat -c%s "{input.DNA}")
        FILE_SIZE_GB=$(echo "scale=2; $FILE_SIZE_BYTES / 1024 / 1024 / 1024" | bc)
        NUM_SEQUENCES=$(grep -c "^@" "{input.DNA}" || echo "0")
        
        echo "===== Input Files =====" >> {log.out}
        echo "Input file: {input.DNA}" >> {log.out}
        echo "File size: ${{FILE_SIZE_GB}}GB ($FILE_SIZE_BYTES bytes)" >> {log.out}
        echo "Number of sequences: $NUM_SEQUENCES" >> {log.out}        
        echo "Input AA file: {input.AA}" >> {log.out}
            
        # Run MitoGeneExtractor
        echo "===== Starting MitoGeneExtractor =====" >> {log.out}
        echo "Command: {params.mge_executor} -q {input.DNA} -p {input.AA} -o {params.output_dir}/alignment/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}_align_ -c {params.output_dir}/consensus/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}_con_ -V {params.vulgar_dir} -r {wildcards.r} -s {wildcards.s} -n {params.n} -C {params.C} -t {params.t} --temporaryDirectory {params.output_dir} --verbosity 10" >> {log.out}
        echo "=======================================" >> {log.out}
        
        time {params.mge_executor} \\
        -q {input.DNA} -p {input.AA} \\
        -o {params.output_dir}/alignment/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}_align_ \\
        -c {params.output_dir}/consensus/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}_con_ \\
        -V {params.vulgar_dir} \\
        -r {wildcards.r} -s {wildcards.s} \\
        -n {params.n} \\
        -C {params.C} \\
        -t {params.t} \\
        --temporaryDirectory {params.output_dir} \\
        --verbosity 10 \\
        >> {log.out} 2>> {log.err} || {{
            EXIT_CODE=$?
            echo "===== MitoGeneExtractor Failed =====" >> {log.out}
            echo "Exit code: $EXIT_CODE" >> {log.out}
            echo "Finished: $(date)" >> {log.out}
            
            # Enhanced failure analysis
            if [ $EXIT_CODE -eq 137 ]; then
                echo "ANALYSIS: Process killed (likely out of memory)" >> {log.out}
                echo "  Current allocation: $(({resources.mem_mb} / 1024))GB" >> {log.out}
                echo "  File size: ${{FILE_SIZE_GB}}GB" >> {log.out}
                echo "  Sequences processed: $NUM_SEQUENCES" >> {log.out}
            elif [ $EXIT_CODE -eq 124 ]; then
                echo "ANALYSIS: Process timed out" >> {log.out}
            elif [ $EXIT_CODE -eq 1 ]; then
                echo "ANALYSIS: General error - check error log for details" >> {log.out}
            else
                echo "ANALYSIS: Unknown failure (exit code $EXIT_CODE)" >> {log.out}
            fi
            
            echo "Final system state:" >> {log.out}
            free -h >> {log.out}
            echo "===================================" >> {log.out}
            
            exit $EXIT_CODE
        }}

        echo "===== Job Completed Successfully =====" >> {log.out}
        echo "Finished: $(date)" >> {log.out}
        echo "Final memory usage:" >> {log.out}
        free -h >> {log.out}
        
        # Validate and report output files
        if [ -f "{output.consensus}" ] && [ -f "{output.alignment}" ]; then
            CONSENSUS_SIZE=$(stat -c%s {output.consensus})
            ALIGNMENT_SIZE=$(stat -c%s {output.alignment})
            CONSENSUS_LINES=$(wc -l < {output.consensus})
            
            echo "===== Output Summary =====" >> {log.out}
            echo "Consensus file: $(numfmt --to=iec $CONSENSUS_SIZE) ($CONSENSUS_LINES lines)" >> {log.out}
            echo "Alignment file: $(numfmt --to=iec $ALIGNMENT_SIZE)" >> {log.out}
            
            if [ $CONSENSUS_LINES -gt 1 ]; then
                echo "✓ Consensus contains sequence data" >> {log.out}
            else
                echo "⚠ WARNING: Consensus may be header-only" >> {log.out}
            fi
            echo "===========================" >> {log.out}
        else
            echo "ERROR: Expected output files not found" >> {log.out}
            echo "Expected files:" >> {log.out}
            echo "  Consensus: {output.consensus}" >> {log.out}
            echo "  Alignment: {output.alignment}" >> {log.out}
            exit 1
        fi
        """


# MGE rule for concat mode
rule MitoGeneExtractor_concat:
    input:
        DNA=get_mge_input_concat,
        AA=get_protein_reference
    output:
        alignment=os.path.join(barcode_recovery_dir_concat, "alignment/{sample}_r_{r}_s_{s}_align_{sample}.fas"),
        consensus=os.path.join(barcode_recovery_dir_concat, "consensus/{sample}_r_{r}_s_{s}_con_{sample}.fas")
    log:
        out=os.path.join(barcode_recovery_dir_concat, "out/{sample}_r_{r}_s_{s}_summary.out"),
        err=os.path.join(barcode_recovery_dir_concat, "err/{sample}_r_{r}_s_{s}_summary.err")
    params:
        mge_executor=config["mge_path"],
        n=config["n"],
        C=config["C"],
        t=config["t"],		
        output_dir=barcode_recovery_dir_concat,
        vulgar_dir=lambda wildcards: os.path.join(barcode_recovery_dir_concat, f"logs/mge/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}/")
    threads: rule_resources["MitoGeneExtractor"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["MitoGeneExtractor"]["mem_mb"] * attempt,
        slurm_partition=rule_resources["MitoGeneExtractor"]["partition"]
    retries: 4
    shell:
        """
        set -euo pipefail

        # Check if input file is empty or missing
        if [ ! -s "{input.DNA}" ]; then
            echo "===== EMPTY/MISSING INPUT DETECTED =====" >> {log.out}
            echo "Started: $(date)" >> {log.out}
            echo "Sample: {wildcards.sample}" >> {log.out}
            echo "Mode: concat" >> {log.out}
            echo "Input file: {input.DNA}" >> {log.out}
            
            if [ ! -f "{input.DNA}" ]; then
                echo "Input file does not exist" >> {log.out}
            else
                FILE_SIZE=$(stat -c%s "{input.DNA}" 2>/dev/null || echo "unknown")
                echo "Input file size: $FILE_SIZE bytes" >> {log.out}
                echo "File is empty - likely all reads were filtered out during trimming" >> {log.out}
            fi
            
            echo "Creating mock outputs..." >> {log.out}
            echo ">Consensus__{wildcards.sample}" > {output.consensus}
            touch {output.alignment}
            touch {log.err}
            echo "Mock outputs created successfully: $(date)" >> {log.out}
            echo "=========================================" >> {log.out}
            exit 0
        fi

        # Create vulgar directory if it doesn't exist
        mkdir -p {params.vulgar_dir}
        
        # Job info and file analysis
        echo "===== Job Info =====" >> {log.out}
        echo "Started: $(date)" >> {log.out}
        echo "Node: $(hostname)" >> {log.out}
        echo "Memory allocated: $(({resources.mem_mb} / 1024))GB" >> {log.out}
        echo "Threads: {threads}" >> {log.out}
        echo "Mode: concat" >> {log.out}
        echo "Sample: {wildcards.sample}" >> {log.out}
        
        # Accurate file size analysis (using stat - the reliable method)
        FILE_SIZE_BYTES=$(stat -c%s "{input.DNA}")
        FILE_SIZE_GB=$(echo "scale=2; $FILE_SIZE_BYTES / 1024 / 1024 / 1024" | bc)
        NUM_SEQUENCES=$(grep -c "^@" "{input.DNA}" || echo "0")
        
        echo "===== Input Files =====" >> {log.out}
        echo "Input file: {input.DNA}" >> {log.out}
        echo "File size: ${{FILE_SIZE_GB}}GB ($FILE_SIZE_BYTES bytes)" >> {log.out}
        echo "Number of sequences: $NUM_SEQUENCES" >> {log.out}        
        echo "Input AA file: {input.AA}" >> {log.out}
            
        # Run MitoGeneExtractor
        echo "===== Starting MitoGeneExtractor =====" >> {log.out}
        echo "Command: {params.mge_executor} -q {input.DNA} -p {input.AA} -o {params.output_dir}/alignment/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}_align_ -c {params.output_dir}/consensus/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}_con_ -V {params.vulgar_dir} -r {wildcards.r} -s {wildcards.s} -n {params.n} -C {params.C} -t {params.t} --temporaryDirectory {params.output_dir} --verbosity 10" >> {log.out}
        echo "=======================================" >> {log.out}
        
        time {params.mge_executor} \\
        -q {input.DNA} -p {input.AA} \\
        -o {params.output_dir}/alignment/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}_align_ \\
        -c {params.output_dir}/consensus/{wildcards.sample}_r_{wildcards.r}_s_{wildcards.s}_con_ \\
        -V {params.vulgar_dir} \\
        -r {wildcards.r} -s {wildcards.s} \\
        -n {params.n} \\
        -C {params.C} \\
        -t {params.t} \\
        --temporaryDirectory {params.output_dir} \\
        --verbosity 10 \\
        >> {log.out} 2>> {log.err} || {{
            EXIT_CODE=$?
            echo "===== MitoGeneExtractor Failed =====" >> {log.out}
            echo "Exit code: $EXIT_CODE" >> {log.out}
            echo "Finished: $(date)" >> {log.out}
            
            # Enhanced failure analysis
            if [ $EXIT_CODE -eq 137 ]; then
                echo "ANALYSIS: Process killed (likely out of memory)" >> {log.out}
                echo "  Current allocation: $(({resources.mem_mb} / 1024))GB" >> {log.out}
                echo "  File size: ${{FILE_SIZE_GB}}GB" >> {log.out}
                echo "  Sequences processed: $NUM_SEQUENCES" >> {log.out}
            elif [ $EXIT_CODE -eq 124 ]; then
                echo "ANALYSIS: Process timed out" >> {log.out}
            elif [ $EXIT_CODE -eq 1 ]; then
                echo "ANALYSIS: General error - check error log for details" >> {log.out}
            else
                echo "ANALYSIS: Unknown failure (exit code $EXIT_CODE)" >> {log.out}
            fi
            
            echo "Final system state:" >> {log.out}
            free -h >> {log.out}
            echo "===================================" >> {log.out}
            
            exit $EXIT_CODE
        }}

        echo "===== Job Completed Successfully =====" >> {log.out}
        echo "Finished: $(date)" >> {log.out}
        echo "Final memory usage:" >> {log.out}
        free -h >> {log.out}
        
        # Validate and report output files
        if [ -f "{output.consensus}" ] && [ -f "{output.alignment}" ]; then
            CONSENSUS_SIZE=$(stat -c%s {output.consensus})
            ALIGNMENT_SIZE=$(stat -c%s {output.alignment})
            CONSENSUS_LINES=$(wc -l < {output.consensus})
            
            echo "===== Output Summary =====" >> {log.out}
            echo "Consensus file: $(numfmt --to=iec $CONSENSUS_SIZE) ($CONSENSUS_LINES lines)" >> {log.out}
            echo "Alignment file: $(numfmt --to=iec $ALIGNMENT_SIZE)" >> {log.out}
            
            if [ $CONSENSUS_LINES -gt 1 ]; then
                echo "✓ Consensus contains sequence data" >> {log.out}
            else
                echo "⚠ WARNING: Consensus may be header-only" >> {log.out}
            fi
            echo "===========================" >> {log.out}
        else
            echo "ERROR: Expected output files not found" >> {log.out}
            echo "Expected files:" >> {log.out}
            echo "  Consensus: {output.consensus}" >> {log.out}
            echo "  Alignment: {output.alignment}" >> {log.out}
            exit 1
        fi
        """
		
# ===== CONSENSUS COMBINATION AND LOG RULES =====

# Rename headers in consensus files for merge mode
rule rename_and_combine_cons_merge:
    input:
        consensus_files=get_all_mge_consensus_files("merge")
    output:
        complete=os.path.join(barcode_recovery_dir_merge, "logs/rename_consensus/rename_complete.txt"),
        concat_cons=os.path.join(barcode_recovery_dir_merge, f"consensus/{run_name}_cons_combined-merge.fasta")
    log:
        os.path.join(barcode_recovery_dir_merge, "logs/rename_consensus/rename_fasta.log")
    params:
        script=os.path.join(workflow.basedir, "scripts/rename_headers.py"),
        preprocessing_mode="merge"
    threads: rule_resources["rename_and_combine_cons"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["rename_and_combine_cons"]["mem_mb"] * attempt
    retries: 3
    shell:
        """ 
        set -euo pipefail

		# Check all input files exist
        for file in {input.consensus_files}; do
            if [ ! -f "$file" ]; then
                echo "Error: Required input file does not exist: $file" >> {log}
                echo "Cannot proceed with renaming and combining consensus files." >> {log}
				echo "If any MitoGeneExtractor jobs failed, please rerun the workflow to generate those files." >> {log}

                exit 1
            fi
        done
		
        # Run script
        python {params.script} \\
            --input-files {input.consensus_files} \\
            --concatenated-consensus {output.concat_cons} \\
            --complete-file={output.complete} \\
            --log-file={log} \\
            --preprocessing-mode={params.preprocessing_mode} \\
            --threads={threads}
    
        # Touch completion file to ensure timestamp is updated
        touch {output.complete}
        """

# Rename headers in consensus files for concat mode
rule rename_and_combine_cons_concat:
    input:
        consensus_files=get_all_mge_consensus_files("concat")
    output:
        complete=os.path.join(barcode_recovery_dir_concat, "logs/rename_consensus/rename_complete.txt"),
        concat_cons=os.path.join(barcode_recovery_dir_concat, f"consensus/{run_name}_cons_combined-concat.fasta")
    log:
        os.path.join(barcode_recovery_dir_concat, "logs/rename_consensus/rename_fasta.log")
    params:
        script=os.path.join(workflow.basedir, "scripts/rename_headers.py"),
        preprocessing_mode="concat"
    threads: rule_resources["rename_and_combine_cons"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["rename_and_combine_cons"]["mem_mb"] * attempt
    retries: 3
    shell:
        """ 
        set -euo pipefail

		# Check all input files exist
        for file in {input.consensus_files}; do
            if [ ! -f "$file" ]; then
                echo "Error: Required input file does not exist: $file" >> {log}
                echo "Cannot proceed with renaming and combining consensus files." >> {log}
				echo "If any MitoGeneExtractor jobs failed, please rerun the workflow to generate those files." >> {log}
                exit 1
            fi
        done
		
        # Run script
        python {params.script} \\
            --input-files {input.consensus_files} \\
            --concatenated-consensus {output.concat_cons} \\
            --complete-file={output.complete} \\
            --log-file={log} \\
            --preprocessing-mode={params.preprocessing_mode} \\
            --threads={threads}
    
        # Touch completion file to ensure timestamp is updated
        touch {output.complete}
        """

# Remove exonerate intermediate files after consensus generation
rule remove_exonerate_intermediates:
    input:
        merge_consensus=os.path.join(barcode_recovery_dir_merge, f"consensus/{run_name}_cons_combined-merge.fasta"),
        concat_consensus=os.path.join(barcode_recovery_dir_concat, f"consensus/{run_name}_cons_combined-concat.fasta"),
        merge_rename_complete=os.path.join(barcode_recovery_dir_merge, "logs/rename_consensus/rename_complete.txt"),
        concat_rename_complete=os.path.join(barcode_recovery_dir_concat, "logs/rename_consensus/rename_complete.txt")
    output:
        merge_completion=os.path.join(barcode_recovery_dir_merge, "logs/exonerate_int_cleanup_complete.txt"),
        concat_completion=os.path.join(barcode_recovery_dir_concat, "logs/exonerate_int_cleanup_complete.txt")
    threads: 1
    resources:
        mem_mb=2048
    retries: 2
    run:     
        # Clean each mode directory and write separate completion files
        for mode_dir, mode_name, completion_file in [
            (barcode_recovery_dir_merge, "merge", output.merge_completion),
            (barcode_recovery_dir_concat, "concat", output.concat_completion)
        ]:
            removed_files = 0
            total_size_saved = 0
            
            with open(completion_file, 'w') as log:
                log.write(f"Starting exonerate intermediate file cleanup for {mode_name} mode: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                log.write(f"Mode directory: {mode_dir}\n\n")
                
                exonerate_pattern = os.path.join(mode_dir, "Concatenated_exonerate_input_*")
                exonerate_files = glob.glob(exonerate_pattern)
                
                if not exonerate_files:
                    log.write(f"No Concatenated_exonerate_input_* files found\n")
                else:
                    log.write(f"Found {len(exonerate_files)} files to remove:\n")
                    
                    for file_path in exonerate_files:
                        try:
                            file_size = os.path.getsize(file_path)
                            os.remove(file_path)
                            
                            size_mb = file_size / (1024 * 1024)
                            log.write(f"  - Removed: {os.path.basename(file_path)} ({size_mb:.2f} MB)\n")
                            
                            removed_files += 1
                            total_size_saved += file_size
                            
                        except OSError as e:
                            log.write(f"  - ERROR removing {file_path}: {e}\n")
                
                # Summary
                total_saved_mb = total_size_saved / (1024 * 1024)
                log.write(f"\nCLEANUP SUMMARY for {mode_name} mode:\n")
                log.write(f"  Total files removed: {removed_files}\n")
                log.write(f"  Total disk space saved: {total_saved_mb:.2f} MB\n")
                log.write(f"  Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
			
# Create list of alignment files for stats for merge mode
rule create_alignment_log_merge:
    input:
        alignment_files=lambda wildcards: glob.glob(os.path.join(barcode_recovery_dir_merge, "alignment/*_align_*.fas")),
        concat_cons=os.path.join(barcode_recovery_dir_merge, f"consensus/{run_name}_cons_combined-merge.fasta")
    output:
        alignment_log=os.path.join(barcode_recovery_dir_merge, "logs/mge/alignment_files.log")
    threads: 1
    resources:
        mem_mb=4096
    retries: 3
    run:
        alignment_files = input.alignment_files

        with open(output.alignment_log, 'w') as log_file:
            for file in alignment_files:
                log_file.write(f"{file}\n")

# Create list of alignment files for stats for concat mode
rule create_alignment_log_concat:
    input:
        alignment_files=lambda wildcards: glob.glob(os.path.join(barcode_recovery_dir_concat, "alignment/*_align_*.fas")),
        concat_cons=os.path.join(barcode_recovery_dir_concat, f"consensus/{run_name}_cons_combined-concat.fasta")
    output:
        alignment_log=os.path.join(barcode_recovery_dir_concat, "logs/mge/alignment_files.log")
    threads: 1
    resources:
        mem_mb=4096
    retries: 3
    run:
        alignment_files = input.alignment_files

        with open(output.alignment_log, 'w') as log_file:
            for file in alignment_files:
                log_file.write(f"{file}\n")
				

# ===== FASTA CLEANER RULES - MERGE MODE =====

# Sequential filtering rules (fasta_cleaner)
rule human_cox1_filter_merge:
    input:
        alignment_log=os.path.join(barcode_recovery_dir_merge, "logs/mge/alignment_files.log")
    output:
        filtered_files=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/01_human_filtered/human_filtered.txt"),
        metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/01_human_filtered/human_filter_metrics.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/01_human_cox1_filter.py"),
        output_dir=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/01_human_filtered"),
        human_threshold=fasta_cleaner_params["human_threshold"]
    log:
        os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/01_human_filter.log")
    threads: rule_resources["human_cox1_filter"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["human_cox1_filter"]["mem_mb"] * attempt
    retries: 2
    shell:
        """
        set -euo pipefail

        mkdir -p {params.output_dir}
        
        python {params.script} \\
            --input-log {input.alignment_log} \\
            --output-dir {params.output_dir} \\
            --filtered-files-list {output.filtered_files} \\
            --metrics-csv {output.metrics} \\
            --human-threshold {params.human_threshold} \\
            --threads {threads} \\
            2>&1 | tee {log}
        """

rule at_content_filter_merge:
    input:
        human_filtered=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/01_human_filtered/human_filtered.txt")
    output:
        filtered_files=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/02_at_filtered/at_filtered.txt"),
        summary_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/02_at_filtered/at_filter_summary.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/02_at_content_filter.py"),
        output_dir=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/02_at_filtered"),
        at_threshold=fasta_cleaner_params["at_difference"],
        at_mode=fasta_cleaner_params["at_mode"],
        consensus_threshold=fasta_cleaner_params["consensus_threshold"]
    log:
        os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/02_at_filter.log")
    threads: rule_resources["at_content_filter"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["at_content_filter"]["mem_mb"] * attempt
    retries: 2
    shell:
        """
        set -euo pipefail

        mkdir -p {params.output_dir}
        
        python -u {params.script} \\
            --input_files {input.human_filtered} \\
            --output_dir {params.output_dir} \\
	        --filtered-files-list {output.filtered_files} \\
            --at_threshold {params.at_threshold} \\
            --at_mode {params.at_mode} \\
            --consensus_threshold {params.consensus_threshold} \\
            --threads {threads} \\
	        2>&1 | tee {log}
        """

rule statistical_outlier_filter_merge:
    input:
        filtered_files=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/02_at_filtered/at_filtered.txt")
    output:
        filtered_files=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered/outlier_filtered.txt"),
        summary_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered/outlier_filter_summary_metrics.csv"),
        individual_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered/outlier_filter_individual_metrics.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/03_statistical_outlier_filter.py"),
        output_dir=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered"),
        outlier_percentile=fasta_cleaner_params["outlier_percentile"],
        consensus_threshold=fasta_cleaner_params["consensus_threshold"]
    log:
        os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/03_outlier_filter.log")
    threads: rule_resources["statistical_outlier_filter"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["statistical_outlier_filter"]["mem_mb"] * attempt
    retries: 2
    shell:
        """
        set -euo pipefail

        mkdir -p {params.output_dir}
        
        python {params.script} \\
            --input-files-list {input.filtered_files} \\
            --output-dir {params.output_dir} \\
            --filtered-files-list {output.filtered_files} \\
            --summary-csv {output.individual_metrics} \\
            --metrics-csv {output.summary_metrics} \\
            --outlier-percentile {params.outlier_percentile} \\
            --consensus-threshold {params.consensus_threshold} \\
            --threads {threads} \\
            2>&1 | tee {log}
        """

if use_reference_filtering:
    # Version WITH reference filtering - MERGE MODE
    rule reference_filter_merge:
        input:
            filtered_files=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered/outlier_filtered.txt")
        output:
            filtered_files=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/04_reference_filtered/reference_filtered.txt"),
            metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/04_reference_filtered/reference_filter_metrics.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/04_reference_filter.py"),
            output_dir=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/04_reference_filtered"),
            reference_dir=reference_dir,
            filter_mode=fasta_cleaner_params["reference_filter_mode"],
        log:
            os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/04_reference_filter.log")
        threads: rule_resources["reference_filter"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["reference_filter"]["mem_mb"] * attempt
        retries: 2
        shell:
            """
            set -euo pipefail

            mkdir -p {params.output_dir}

            python {params.script} \\
                --input-files-list {input.filtered_files} \\
                --output-dir {params.output_dir} \\
                --filtered-files-list {output.filtered_files} \\
                --metrics-csv {output.metrics} \\
                --reference-dir {params.reference_dir} \\
                --filter-mode {params.filter_mode} \\
                --threads {threads} \\
                2>&1 | tee {log}
            """

    rule consensus_generation_merge:
        input:
            filtered_files=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/04_reference_filtered/reference_filtered.txt")
        output:
            concat_consensus=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/cleaned_cons_combined.fasta"),
            consensus_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-merge.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/05_consensus_generator.py"),
            output_dir=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/05_cleaned_consensus"),
            consensus_threshold=fasta_cleaner_params["consensus_threshold"],
            preprocessing_mode="merge"
        log:
            os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/05_consensus_generation.log")
        threads: rule_resources["consensus_generation"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["consensus_generation"]["mem_mb"] * attempt
        retries: 2
        shell:
            """
            set -euo pipefail

            python {params.script} \\
                --input-files-list {input.filtered_files} \\
                --output-dir {params.output_dir} \\
                --consensus-fasta {output.concat_consensus} \\
                --consensus-metrics {output.consensus_metrics} \\
                --consensus-threshold {params.consensus_threshold} \\
                --preprocessing-mode {params.preprocessing_mode} \\
                --threads {threads} \\
                2>&1 | tee {log}
            """

    rule aggregate_filter_metrics_merge:
        input:
            human_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/01_human_filtered/human_filter_metrics.csv"),
            at_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/02_at_filtered/at_filter_summary.csv"),
            outlier_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered/outlier_filter_individual_metrics.csv"),
            reference_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/04_reference_filtered/reference_filter_metrics.csv"),
            consensus_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-merge.csv")
        output:
            completion_flag=os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner_complete.txt"),
            combined_statistics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/combined_statistics.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/06_aggregate_filter_metrics.py"),
            output_dir=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner")
        log:
            os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/06_aggregate_metrics.log")
        threads: 2
        resources:
            mem_mb=3076
        retries: 2
        shell:
            """
            set -euo pipefail

            python {params.script} \\
                --human-metrics {input.human_metrics} \\
                --at-metrics {input.at_metrics} \\
                --outlier-metrics {input.outlier_metrics} \\
                --reference-metrics {input.reference_metrics} \\
                --consensus-metrics {input.consensus_metrics} \\
                --output-dir {params.output_dir} \\
                --combined-statistics {output.combined_statistics} \\
                --threads {threads} \\
                2>&1 | tee {log}
                
            touch {output.completion_flag}
            """
else:
    # Version WITHOUT reference filtering
    rule consensus_generation_merge:
        input:
            filtered_files=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered/outlier_filtered.txt")
        output:
            concat_consensus=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/cleaned_cons_combined.fasta"),
            consensus_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-merge.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/05_consensus_generator.py"),
            output_dir=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/05_cleaned_consensus"),
            consensus_threshold=fasta_cleaner_params["consensus_threshold"],
            preprocessing_mode="merge"
        log:
            os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/05_consensus_generation.log")
        threads: rule_resources["consensus_generation"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["consensus_generation"]["mem_mb"] * attempt
        retries: 2
        shell:
            """
            set -euo pipefail

            python {params.script} \\
                --input-files-list {input.filtered_files} \\
                --output-dir {params.output_dir} \\
                --consensus-fasta {output.concat_consensus} \\
                --consensus-metrics {output.consensus_metrics} \\
                --consensus-threshold {params.consensus_threshold} \\
                --preprocessing-mode {params.preprocessing_mode} \\
                --threads {threads} \\
                2>&1 | tee {log}
            """

    rule aggregate_filter_metrics_merge:
        input:
            human_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/01_human_filtered/human_filter_metrics.csv"),
            at_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/02_at_filtered/at_filter_summary.csv"),
            outlier_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/03_outlier_filtered/outlier_filter_individual_metrics.csv"),
            consensus_metrics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-merge.csv")
        output:
            completion_flag=os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner_complete.txt"),
            combined_statistics=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/combined_statistics.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/06_aggregate_filter_metrics.py"),
            output_dir=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner")
        log:
            os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/06_aggregate_metrics.log")
        threads: 2
        resources:
            mem_mb=3076
        retries: 2
        shell:
            """
            set -euo pipefail

            python {params.script} \\
                --human-metrics {input.human_metrics} \\
                --at-metrics {input.at_metrics} \\
                --outlier-metrics {input.outlier_metrics} \\
                --consensus-metrics {input.consensus_metrics} \\
                --output-dir {params.output_dir} \\
                --combined-statistics {output.combined_statistics} \\
                --threads {threads} \\
                2>&1 | tee {log}
                
            touch {output.completion_flag}
            """
			
# ===== FASTA CLEANER RULES - CONCAT MODE =====

# Sequential filtering rules (fasta_cleaner)
rule human_cox1_filter_concat:
    input:
        alignment_log=os.path.join(barcode_recovery_dir_concat, "logs/mge/alignment_files.log")
    output:
        filtered_files=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/01_human_filtered/human_filtered.txt"),
        metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/01_human_filtered/human_filter_metrics.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/01_human_cox1_filter.py"),
        output_dir=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/01_human_filtered"),
        human_threshold=fasta_cleaner_params["human_threshold"]
    log:
        os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/01_human_filter.log")
    threads: rule_resources["human_cox1_filter"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["human_cox1_filter"]["mem_mb"] * attempt
    retries: 2
    shell:
        """
        set -euo pipefail

        mkdir -p {params.output_dir}
        
        python {params.script} \\
            --input-log {input.alignment_log} \\
            --output-dir {params.output_dir} \\
            --filtered-files-list {output.filtered_files} \\
            --metrics-csv {output.metrics} \\
            --human-threshold {params.human_threshold} \\
            --threads {threads} \\
            2>&1 | tee {log}
        """

rule at_content_filter_concat:
    input:
        human_filtered=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/01_human_filtered/human_filtered.txt")
    output:
        filtered_files=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/02_at_filtered/at_filtered.txt"),
        summary_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/02_at_filtered/at_filter_summary.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/02_at_content_filter.py"),
        output_dir=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/02_at_filtered"),
        at_threshold=fasta_cleaner_params["at_difference"],
        at_mode=fasta_cleaner_params["at_mode"],
        consensus_threshold=fasta_cleaner_params["consensus_threshold"]
    log:
        os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/02_at_filter.log")
    threads: rule_resources["at_content_filter"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["at_content_filter"]["mem_mb"] * attempt
    retries: 2
    shell:
        """
        set -euo pipefail

        mkdir -p {params.output_dir}
        
        python -u {params.script} \\
            --input_files {input.human_filtered} \\
            --output_dir {params.output_dir} \\
	        --filtered-files-list {output.filtered_files} \\
            --at_threshold {params.at_threshold} \\
            --at_mode {params.at_mode} \\
            --consensus_threshold {params.consensus_threshold} \\
            --threads {threads} \\
	        2>&1 | tee {log}
        """

rule statistical_outlier_filter_concat:
    input:
        filtered_files=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/02_at_filtered/at_filtered.txt")
    output:
        filtered_files=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered/outlier_filtered.txt"),
        summary_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered/outlier_filter_summary_metrics.csv"),
        individual_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered/outlier_filter_individual_metrics.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/03_statistical_outlier_filter.py"),
        output_dir=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered"),
        outlier_percentile=fasta_cleaner_params["outlier_percentile"],
        consensus_threshold=fasta_cleaner_params["consensus_threshold"]
    log:
        os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/03_outlier_filter.log")
    threads: rule_resources["statistical_outlier_filter"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["statistical_outlier_filter"]["mem_mb"] * attempt
    retries: 2
    shell:
        """
        set -euo pipefail

        mkdir -p {params.output_dir}
        
        python {params.script} \\
            --input-files-list {input.filtered_files} \\
            --output-dir {params.output_dir} \\
            --filtered-files-list {output.filtered_files} \\
            --summary-csv {output.individual_metrics} \\
            --metrics-csv {output.summary_metrics} \\
            --outlier-percentile {params.outlier_percentile} \\
            --consensus-threshold {params.consensus_threshold} \\
            --threads {threads} \\
            2>&1 | tee {log}
        """
		
if use_reference_filtering:
    # Version WITH reference filtering - CONCAT MODE  
    rule reference_filter_concat:
        input:
            filtered_files=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered/outlier_filtered.txt")
        output:
            filtered_files=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/04_reference_filtered/reference_filtered.txt"),
            metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/04_reference_filtered/reference_filter_metrics.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/04_reference_filter.py"),
            output_dir=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/04_reference_filtered"),
            reference_dir=reference_dir,
            filter_mode=fasta_cleaner_params["reference_filter_mode"],
        log:
            os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/04_reference_filter.log")
        threads: rule_resources["reference_filter"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["reference_filter"]["mem_mb"] * attempt
        retries: 2
        shell:
            """
            set -euo pipefail
            
            mkdir -p {params.output_dir}
            
            python {params.script} \\
                --input-files-list {input.filtered_files} \\
                --output-dir {params.output_dir} \\
                --filtered-files-list {output.filtered_files} \\
                --metrics-csv {output.metrics} \\
                --reference-dir {params.reference_dir} \\
                --filter-mode {params.filter_mode} \\
                --threads {threads} \\
                2>&1 | tee {log}
            """

    rule consensus_generation_concat:
        input:
            filtered_files=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/04_reference_filtered/reference_filtered.txt")
        output:
            concat_consensus=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/cleaned_cons_combined.fasta"),
            consensus_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-concat.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/05_consensus_generator.py"),
            output_dir=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/05_cleaned_consensus"),
            consensus_threshold=fasta_cleaner_params["consensus_threshold"],
            preprocessing_mode="concat"
        log:
            os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/05_consensus_generation.log")
        threads: rule_resources["consensus_generation"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["consensus_generation"]["mem_mb"] * attempt
        retries: 2
        shell:
            """
            set -euo pipefail

            python {params.script} \\
                --input-files-list {input.filtered_files} \\
                --output-dir {params.output_dir} \\
                --consensus-fasta {output.concat_consensus} \\
                --consensus-metrics {output.consensus_metrics} \\
                --consensus-threshold {params.consensus_threshold} \\
                --preprocessing-mode {params.preprocessing_mode} \\
                --threads {threads} \\
                2>&1 | tee {log}
            """

    rule aggregate_filter_metrics_concat:
        input:
            human_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/01_human_filtered/human_filter_metrics.csv"),
            at_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/02_at_filtered/at_filter_summary.csv"),
            outlier_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered/outlier_filter_individual_metrics.csv"),
            reference_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/04_reference_filtered/reference_filter_metrics.csv"),
            consensus_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-concat.csv")
        output:
            completion_flag=os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner_complete.txt"),
            combined_statistics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/combined_statistics.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/06_aggregate_filter_metrics.py"),
            output_dir=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner")
        log:
            os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/06_aggregate_metrics.log")
        threads: 2
        resources:
            mem_mb=3076
        retries: 2
        shell:
            """
            set -euo pipefail

            python {params.script} \\
                --human-metrics {input.human_metrics} \\
                --at-metrics {input.at_metrics} \\
                --outlier-metrics {input.outlier_metrics} \\
                --reference-metrics {input.reference_metrics} \\
                --consensus-metrics {input.consensus_metrics} \\
                --output-dir {params.output_dir} \\
                --combined-statistics {output.combined_statistics} \\
                --threads {threads} \\
                2>&1 | tee {log}
                
            touch {output.completion_flag}
            """

else:
    # Version WITHOUT reference filtering
    rule consensus_generation_concat:
        input:
            filtered_files=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered/outlier_filtered.txt")
        output:
            concat_consensus=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/cleaned_cons_combined.fasta"),
            consensus_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-concat.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/05_consensus_generator.py"),
            output_dir=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/05_cleaned_consensus"),
            consensus_threshold=fasta_cleaner_params["consensus_threshold"],
            preprocessing_mode="concat"
        log:
            os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/05_consensus_generation.log")
        threads: rule_resources["consensus_generation"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["consensus_generation"]["mem_mb"] * attempt
        retries: 2
        shell:
            """
            set -euo pipefail

            python {params.script} \\
                --input-files-list {input.filtered_files} \\
                --output-dir {params.output_dir} \\
                --consensus-fasta {output.concat_consensus} \\
                --consensus-metrics {output.consensus_metrics} \\
                --consensus-threshold {params.consensus_threshold} \\
                --preprocessing-mode {params.preprocessing_mode} \\
                --threads {threads} \\
                2>&1 | tee {log}
            """

    rule aggregate_filter_metrics_concat:
        input:
            human_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/01_human_filtered/human_filter_metrics.csv"),
            at_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/02_at_filtered/at_filter_summary.csv"),
            outlier_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/03_outlier_filtered/outlier_filter_individual_metrics.csv"),
            consensus_metrics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/05_cleaned_consensus/cleaned_cons_metrics-concat.csv")
        output:
            completion_flag=os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner_complete.txt"),
            combined_statistics=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/combined_statistics.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/06_aggregate_filter_metrics.py"),
            output_dir=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner")
        log:
            os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/06_aggregate_metrics.log")
        threads: 1
        resources:
            mem_mb=3076
        retries: 2
        shell:
            """
            set -euo pipefail

            python {params.script} \\
                --human-metrics {input.human_metrics} \\
                --at-metrics {input.at_metrics} \\
                --outlier-metrics {input.outlier_metrics} \\
                --consensus-metrics {input.consensus_metrics} \\
                --output-dir {params.output_dir} \\
                --combined-statistics {output.combined_statistics} \\
                --threads {threads} \\
                2>&1 | tee {log}
                
            touch {output.completion_flag}
            """
			
# ===== CLEANUP AND CONSENSUS COMBINATION RULES =====

# Clean up intermediate fasta files from filtering directories (remove intermediate FASTA files)
rule remove_fasta_cleaner_files:
    input:
        # Wait for filtering to complete
        merge_cleaning_complete=os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner_complete.txt"),
        concat_cleaning_complete=os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner_complete.txt"),
        # Ensure final consensus sequences are created before cleanup
        merge_consensus=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/cleaned_cons_combined.fasta"),
        concat_consensus=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/cleaned_cons_combined.fasta")
    output:
        merge_cleanup_complete=os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/fasta_cleaner_complete.txt"),
        concat_cleanup_complete=os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/fasta_cleaner_complete.txt")
    params:
        merge_dir=barcode_recovery_dir_merge,
        concat_dir=barcode_recovery_dir_concat,
        use_ref_filtering=str(use_reference_filtering)
    threads: 1
    resources:
        mem_mb=2048
    retries: 2
    run:
        def cleanup_fasta_files(mode_dir, mode_name, completion_file):       
            removed_files = 0
            total_size_saved = 0
            
            with open(completion_file, 'w') as log:
                log.write(f"Starting intermediate fasta cleanup for {mode_name} mode: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                log.write(f"Mode directory: {mode_dir}\n")
                log.write(f"Reference filtering enabled: {params.use_ref_filtering}\n\n")
                
                # Define directories to clean
                filter_dirs = [
                    "01_human_filtered",
                    "02_at_filtered", 
                    "03_outlier_filtered"
                ]
                
                # Add reference filtering directory if enabled
                if params.use_ref_filtering == "True":
                    filter_dirs.append("04_reference_filtered")
                    log.write("Including 04_reference_filtered directory in cleanup\n")
                else:
                    log.write("Skipping 04_reference_filtered directory (not enabled)\n")
                
                log.write(f"Directories to clean: {', '.join(filter_dirs)}\n\n")
                
                # Clean each filtering directory
                for filter_dir in filter_dirs:
                    dir_path = os.path.join(mode_dir, "fasta_cleaner", filter_dir)
                    
                    if not os.path.exists(dir_path):
                        log.write(f"[{filter_dir}] Directory not found: {dir_path}\n")
                        continue
                    
                    log.write(f"[{filter_dir}] Cleaning directory: {dir_path}\n")
                    
                    # Find all .fasta and .fas files
                    fasta_patterns = [
                        os.path.join(dir_path, "*.fasta"),
                        os.path.join(dir_path, "*.fas"),
                        os.path.join(dir_path, "**", "*.fasta"),
                        os.path.join(dir_path, "**", "*.fas")
                    ]
                    
                    fasta_files = []
                    for pattern in fasta_patterns:
                        fasta_files.extend(glob.glob(pattern, recursive=True))
                    
                    # Remove duplicates
                    fasta_files = list(set(fasta_files))
                    
                    if not fasta_files:
                        log.write(f"[{filter_dir}] No .fasta or .fas files found\n")
                        continue
                    
                    log.write(f"[{filter_dir}] Found {len(fasta_files)} fasta files to remove:\n")
                    
                    dir_removed = 0
                    dir_size_saved = 0
                    
                    for fasta_file in fasta_files:
                        try:
                            # Get file size before removal
                            file_size = os.path.getsize(fasta_file)
                            
                            # Remove the file
                            os.remove(fasta_file)
                            
                            # Log the removal
                            size_mb = file_size / (1024 * 1024)
                            log.write(f"  - Removed: {os.path.basename(fasta_file)} ({size_mb:.2f} MB)\n")
                            
                            dir_removed += 1
                            dir_size_saved += file_size
                            
                        except OSError as e:
                            log.write(f"  - ERROR removing {fasta_file}: {e}\n")
                    
                    removed_files += dir_removed
                    total_size_saved += dir_size_saved
                    
                    size_saved_mb = dir_size_saved / (1024 * 1024)
                    log.write(f"[{filter_dir}] Removed {dir_removed} files, saved {size_saved_mb:.2f} MB\n\n")
                
                # Summary
                total_saved_mb = total_size_saved / (1024 * 1024)
                log.write(f"CLEANUP SUMMARY for {mode_name} mode:\n")
                log.write(f"  Total files removed: {removed_files}\n")
                log.write(f"  Total disk space saved: {total_saved_mb:.2f} MB\n")
                log.write(f"  Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                
                return removed_files, total_saved_mb
        
        # Clean up merge mode - write directly to completion file
        merge_removed, merge_saved = cleanup_fasta_files(params.merge_dir, "merge", output.merge_cleanup_complete)
        
        # Clean up concat mode - write directly to completion file
        concat_removed, concat_saved = cleanup_fasta_files(params.concat_dir, "concat", output.concat_cleanup_complete)


rule cat_consensus_files:
    input:
        # Pre-cleaning consensus files
        fasta_concat=os.path.join(barcode_recovery_dir_concat, f"consensus/{run_name}_cons_combined-concat.fasta"),
        fasta_merge=os.path.join(barcode_recovery_dir_merge, f"consensus/{run_name}_cons_combined-merge.fasta"),
        # Post-cleaning consensus files
        merge_consensus=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/cleaned_cons_combined.fasta"),
        concat_consensus=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/cleaned_cons_combined.fasta")
    output:
        final_consensus=os.path.join(barcode_recovery_dir, f"{run_name}_all_cons_combined.fasta")
    log:
        os.path.join(barcode_recovery_dir, "logs/cat_consensus_files.log")
    threads: 2
    resources:
        mem_mb=3076
    retries: 2
    shell:
        """
        set -euo pipefail
		                
        echo "Concatenating all (2-4) consensus multi-FASTA files for {run_name}" > {log}
        echo "Started: $(date)" >> {log}

        # Concatenation
        cat {input.fasta_concat} {input.fasta_merge} {input.merge_consensus} {input.concat_consensus} > {output.final_consensus}

        echo "Completed: $(date)" >> {log}
        echo "Output file size: $(numfmt --to=iec $(stat -c%s {output.final_consensus}))" >> {log}
        """
		
rule barcode_consensus_count:
    input:
        all_cons_combined=os.path.join(barcode_recovery_dir, f"{run_name}_all_cons_combined.fasta")
    output:
        tsv_barcode_count=os.path.join(barcode_recovery_dir, "barcode_consensus_count.tsv"),
        log_barcode_count=os.path.join(barcode_recovery_dir, "logs/barcode_consensus_count.log")
    params:
        script=os.path.join(workflow.basedir, "scripts/barcode_consensus_count.py"),
        outlier_percentile=fasta_cleaner_params["outlier_percentile"],
        consensus_threshold=fasta_cleaner_params["consensus_threshold"]
    threads: 1
    resources:
        mem_mb=1024
    retries: 3
    shell:
        """
        set -euo pipefail
        python {params.script} \\
            --input {input.all_cons_combined} \\
            --tsv {output.tsv_barcode_count} \\
            --log {output.log_barcode_count}
        """
		
		
# ===== VALIDATION RULES =====

if run_structural_validation:
    rule structural_validation:
        input:
            fasta_concat=os.path.join(barcode_recovery_dir_concat, f"consensus/{run_name}_cons_combined-concat.fasta"),
            fasta_merge=os.path.join(barcode_recovery_dir_merge, f"consensus/{run_name}_cons_combined-merge.fasta"),
            clean_fasta_concat=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/cleaned_cons_combined.fasta"),
            clean_fasta_merge=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/cleaned_cons_combined.fasta"),
            stats_concat=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/combined_statistics.csv"),
            stats_merge=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/combined_statistics.csv"),
            hmm_file=hmm_file_path
        output:
            csv=os.path.join(main_output_dir, f"04_barcode_validation/structural/structural_validation.csv"),
            all_passing_barcode=os.path.join(main_output_dir, f"04_barcode_validation/structural/output_barcode_all_passing.fasta")
        params:
            script=os.path.join(workflow.basedir, "scripts/structural_validation.py"),
            target_marker=structural_validation_params["target"],
            genetic_code=structural_validation_params.get("genetic_code", 1),  # Default to standard genetic code
            verbosity=" ".join(["--verbose" if structural_validation_params["verbose"] else ""]).strip()
        log:
            os.path.join(main_output_dir, f"04_barcode_validation/logs/structural_validation.log")
        threads: rule_resources["structural_validation"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["structural_validation"]["mem_mb"] * attempt,
            slurm_partition=rule_resources["structural_validation"]["partition"]
        retries: 3
        shell:
            """
            set -euo pipefail

            # Ensure HMM file exists
            if [ ! -f "{input.hmm_file}" ]; then
                echo "ERROR: HMM file not found: {input.hmm_file}" >> {log}
                echo "Please ensure the HMM file for target '{params.target_marker}' exists at the specified path." >> {log}
                exit 1
            fi

            python {params.script} \
                --output-csv {output.csv} \
                --input {input.fasta_concat} {input.fasta_merge} {input.clean_fasta_concat} {input.clean_fasta_merge} \
                --hmm {input.hmm_file} \
                --code {params.genetic_code} \
                --log-file {log} \
                --disable-selection \
                {params.verbosity}
            """

if run_taxonomic_validation:
    rule local_blast:
        input:
            barcode_fasta=os.path.join(main_output_dir, f"04_barcode_validation/structural/output_barcode_all_passing.fasta")
        output:
            csv=os.path.join(main_output_dir, f"04_barcode_validation/taxonomic/metrics/01_local_blast_output.csv")
        params:
            script=os.path.join(workflow.basedir, "scripts/tv_local_blast.py"),
            database=taxonomic_validation_params["database"],
            verbosity="--verbose" if taxonomic_validation_params["verbose"] else "",
            out_dir=os.path.join(main_output_dir, f"04_barcode_validation/taxonomic/") 
        log:
            os.path.join(main_output_dir, f"04_barcode_validation/logs/01_local_blast.log")
        threads: rule_resources["taxonomic_validation"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["taxonomic_validation"]["mem_mb"] * attempt,
            slurm_partition=rule_resources["taxonomic_validation"]["partition"]
        retries: 3
        shell:
            """
            set -euo pipefail
            
            # Create output directory
            mkdir -p {params.out_dir}
            
            # Check if database exists
            if [ ! -e "{params.database}" ]; then
                echo "ERROR: Database not found: {params.database}" >> {log}
                echo "Please ensure the database path exists and contains BLAST database files or a FASTA file." >> {log}
                exit 1
            fi

            echo "Starting taxonomic validation: $(date)" > {log}
            echo "Input FASTA: {input.barcode_fasta}" >> {log}
            echo "Database: {params.database}" >> {log}
            echo "Output directory: {params.out_dir}" >> {log}
            echo "Threads: {threads}" >> {log}
            echo "Verbose: {params.verbosity}" >> {log}

            python {params.script} \\
                --input {input.barcode_fasta} \\
                --database {params.database} \\
                --output {params.out_dir} \\
                --output-csv {output.csv} \\
                --processes {threads} \\
                {params.verbosity} \\
                >> {log} 2>&1

            echo "Taxonomic validation completed: $(date)" >> {log}
            """

if run_taxonomic_validation:
    rule blast2taxonomy:
        input:
            blast_csv=os.path.join(main_output_dir, f"04_barcode_validation/taxonomic/metrics/01_local_blast_output.csv"),
            blast_taxonomy=taxonomic_validation_params["database_taxonomy"],
            expected_taxonomy=taxonomic_validation_params["expected_taxonomy"],
            input_fasta=os.path.join(main_output_dir, f"04_barcode_validation/structural/output_barcode_all_passing.fasta")
        output:
            taxonomy_csv=os.path.join(main_output_dir, f"04_barcode_validation/taxonomic/metrics/02_taxonomic_validation.csv"),
            barcode_fasta=os.path.join(main_output_dir, f"04_barcode_validation/taxonomic/validated_barcodes.fasta"),
            final_fasta=os.path.join(main_output_dir, f"04_barcode_validation/{run_name}_validated_barcodes.fasta")
        params:
            script=os.path.join(workflow.basedir, "scripts/tv_blast2taxonomy.py"),
            taxval_rank=taxonomic_validation_params.get("taxval_rank", "family"),
            min_pident=taxonomic_validation_params.get("min_pident", "80"),
            min_length=taxonomic_validation_params.get("min_length", "100")
        log:
            os.path.join(main_output_dir, f"04_barcode_validation/logs/02_taxonomic_validation.log")
        threads: rule_resources["blast2taxonomy"]["threads"]
        resources:
            mem_mb=lambda wildcards, attempt: rule_resources["blast2taxonomy"]["mem_mb"] * attempt
        retries: 3
        shell:
            """
            set -euo pipefail

            echo "Starting taxonomy validation analysis: $(date)" > {log}
            echo "BLAST CSV: {input.blast_csv}" >> {log}
            echo "BLAST taxonomy: {input.blast_taxonomy}" >> {log}
            echo "Expected taxonomy: {input.expected_taxonomy}" >> {log}
            echo "Input FASTA: {input.input_fasta}" >> {log}
            echo "Validation rank: {params.taxval_rank}" >> {log}
            echo "Output CSV: {output.taxonomy_csv}" >> {log}
            echo "Output FASTA: {output.barcode_fasta}" >> {log}

            python {params.script} \\
                --taxval-rank {params.taxval_rank} \\
                --input-blast-csv {input.blast_csv} \\
                --input-taxonomy-file {input.blast_taxonomy} \\
                --input-exp-taxonomy {input.expected_taxonomy} \\
                --input-fasta {input.input_fasta} \\
                --output-csv {output.taxonomy_csv} \\
                --output-fasta {output.barcode_fasta} \\
                --min-pident {params.min_pident} \\
                --min-length {params.min_length} \\
                --log {log} \\
                >> {log} 2>&1

            # Copy and rename the barcode_fasta to another directory
            cp {output.barcode_fasta} {output.final_fasta}

            echo "Taxonomy validation analysis completed: $(date)" >> {log}
            """
			
# ===== STATISTICS AND FINAL RULES =====

# Extract, aggregate and calculate stats from inputs for merge mode
rule extract_stats_to_csv_merge:
    input:
        alignment_log=os.path.join(barcode_recovery_dir_merge, "logs/mge/alignment_files.log"),
        out_files=expand(
            os.path.join(barcode_recovery_dir_merge, "out/{sample}_r_{r}_s_{s}_summary.out"),
            sample=list(samples.keys()),
            r=config["r"],
            s=config["s"]
        ),
        pre_fasta=os.path.join(barcode_recovery_dir_merge, f"consensus/{run_name}_cons_combined-merge.fasta"),
        cleaning_csv=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/combined_statistics.csv"),
        cleaner_complete=os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner_complete.txt"),
        post_fasta=os.path.join(barcode_recovery_dir_merge, "fasta_cleaner/cleaned_cons_combined.fasta"),
        ref_seqs=os.path.join(main_output_dir, "02_references/sequence_references.csv") if run_gene_fetch else []
    output:
        stats=os.path.join(barcode_recovery_dir_merge, f"{run_name}_merge-stats.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/mge_stats.py"),
        out_dir=os.path.join(barcode_recovery_dir_merge, "out/"),
        log_dir=os.path.join(barcode_recovery_dir_merge, "logs/mge"),
        ref_seqs_arg=lambda wildcards, input: f"--ref_seqs {input.ref_seqs}" if run_gene_fetch else ""
    threads: rule_resources["extract_stats_to_csv"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["extract_stats_to_csv"]["mem_mb"] * attempt
    retries: 3
    shell:
        """
        set -euo pipefail
        # Run script
        python {params.script} -a {input.alignment_log} -o {output.stats} -od {params.out_dir} -c {input.cleaning_csv} {params.ref_seqs_arg} --pre-fasta {input.pre_fasta} --post-fasta {input.post_fasta}
        # Relocate logs
        mv mge_stats.log {params.log_dir}/
        """

rule extract_stats_to_csv_concat:
    input:
        alignment_log=os.path.join(barcode_recovery_dir_concat, "logs/mge/alignment_files.log"),
        out_files=expand(
            os.path.join(barcode_recovery_dir_concat, "out/{sample}_r_{r}_s_{s}_summary.out"),
            sample=list(samples.keys()),
            r=config["r"],
            s=config["s"]
        ),
        pre_fasta=os.path.join(barcode_recovery_dir_concat, f"consensus/{run_name}_cons_combined-concat.fasta"),
        cleaning_csv=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/combined_statistics.csv"),
        cleaner_complete=os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner_complete.txt"),
        post_fasta=os.path.join(barcode_recovery_dir_concat, "fasta_cleaner/cleaned_cons_combined.fasta"),
        ref_seqs=os.path.join(main_output_dir, "02_references/sequence_references.csv") if run_gene_fetch else []
    output:
        stats=os.path.join(barcode_recovery_dir_concat, f"{run_name}_concat-stats.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/mge_stats.py"),
        out_dir=os.path.join(barcode_recovery_dir_concat, "out/"),
        log_dir=os.path.join(barcode_recovery_dir_concat, "logs/mge"),
        ref_seqs_arg=lambda wildcards, input: f"--ref_seqs {input.ref_seqs}" if run_gene_fetch else ""
    threads: rule_resources["extract_stats_to_csv"]["threads"]
    resources:
        mem_mb=lambda wildcards, attempt: rule_resources["extract_stats_to_csv"]["mem_mb"] * attempt
    retries: 3
    shell:
        """
        set -euo pipefail
        # Run script
        python {params.script} -a {input.alignment_log} -o {output.stats} -od {params.out_dir} -c {input.cleaning_csv} {params.ref_seqs_arg} --pre-fasta {input.pre_fasta} --post-fasta {input.post_fasta}
        # Relocate logs
        mv mge_stats.log {params.log_dir}/
        """

# Concatenate merge and concat stats CSV files with column alignment
rule combine_stats_files:
    input:
        merge_stats=os.path.join(barcode_recovery_dir_merge, f"{run_name}_merge-stats.csv"),
        concat_stats=os.path.join(barcode_recovery_dir_concat, f"{run_name}_concat-stats.csv")
    output:
        combined_stats=os.path.join(barcode_recovery_dir, f"{run_name}_barcode_recovery_metrics.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/csv_combiner_mge.py")
    threads: 2
    resources:
        mem_mb=3076
    retries: 3
    shell:
        """
        set -euo pipefail

        python {params.script} \
            --input {input.merge_stats} {input.concat_stats} \
            --output {output.combined_stats}
        """	

rule val_barcode_recovery_metrics_merge:
    input:
        structval=os.path.join(main_output_dir, f"04_barcode_validation/structural/structural_validation.csv"),
        taxval=os.path.join(main_output_dir, f"04_barcode_validation/taxonomic/metrics/02_taxonomic_validation.csv"),
        beegees_csv=os.path.join(barcode_recovery_dir, f"{run_name}_barcode_recovery_metrics.csv")
    output:
        final_metrics=os.path.join(main_output_dir, f"{run_name}_final_metrics.csv")
    params:
        script=os.path.join(workflow.basedir, "scripts/val_csv_merger.py")
    threads: 1
    resources:
        mem_mb=3076
    retries: 3
    shell:
        """
        set -euo pipefail

        python {params.script} \
            --structval-csv {input.structval} \
            --taxval-csv {input.taxval} \
            --beegees-csv {input.beegees_csv} \
            --output {output.final_metrics}
        """	

if run_structural_validation and run_taxonomic_validation:
    rule barcoding_outcome:
        input:
            final_metrics=os.path.join(main_output_dir, f"{run_name}_final_metrics.csv")
        output:
            outcome_tsv=os.path.join(main_output_dir, "05_barcoding_outcome/barcoding_outcome.tsv")
        params:
            script=os.path.join(workflow.basedir, "scripts/barcoding_outcome.py")
        log:
            os.path.join(main_output_dir, "05_barcoding_outcome/barcoding_outcome.log")
        threads: 1
        resources:
            mem_mb=4096
        retries: 2
        shell:
            """
            set -euo pipefail
            
            mkdir -p $(dirname {output.outcome_tsv})
            
            python {params.script} \
                --input {input.final_metrics} \
                --output {output.outcome_tsv} \
                --log {log}
            """
			
# Final clean up superfluous files
rule cleanup_files:
    input:
        # Merge mode inputs
        merge_summary_csv=os.path.join(barcode_recovery_dir_merge, f"{run_name}_merge-stats.csv"),
        clean_headers_log=os.path.join(preprocessing_dir_merge, "logs/clean_headers/clean_headers.log"),
        merge_exon_remove=os.path.join(barcode_recovery_dir_merge, "logs/exonerate_int_cleanup_complete.txt"),
        # Concat mode inputs
        concat_summary_csv=os.path.join(barcode_recovery_dir_concat, f"{run_name}_concat-stats.csv"),
        concat_logs=os.path.join(preprocessing_dir_concat, "logs/concat/concat_reads.log"),
        trim_galore_logs=os.path.join(preprocessing_dir_concat, "logs/trim_galore/trim_galore.log"),
        concat_exon_remove=os.path.join(barcode_recovery_dir_concat, "logs/exonerate_int_cleanup_complete.txt"),
        # Wait for fasta cleanup to complete
        merge_fasta_cleanup=os.path.join(barcode_recovery_dir_merge, "logs/fasta_cleaner/fasta_cleaner_complete.txt"),
        concat_fasta_cleanup=os.path.join(barcode_recovery_dir_concat, "logs/fasta_cleaner/fasta_cleaner_complete.txt")
    output:
        merge_complete=touch(os.path.join(preprocessing_dir_merge, "logs/final_cleanup_complete.txt")),
        concat_complete=touch(os.path.join(preprocessing_dir_concat, "logs/final_cleanup_complete.txt"))
    threads: 1
    resources:
        mem_mb=3072
    retries: 2
    run:      
        # Function to clean up a specific mode's files
        def cleanup_mode_files(mode_dir, mode_name):
            removed_files = 0
            removed_dirs = []
            cleaned_dirs = []
            
            # Remove empty .log files in mge subdir - both modes
            mge_log_dir = os.path.join(mode_dir, "logs/mge")
            if os.path.exists(mge_log_dir):
                # Find all .log files in subdirectories of mge/
                mge_log_files = glob.glob(os.path.join(mge_log_dir, "**", "*.log"), recursive=True)
                removed_empty_logs = 0
                skipped_nonempty_logs = 0
                
                for mge_log_file in mge_log_files:
                    # Check if the log file is empty (0 bytes)
                    if os.path.getsize(mge_log_file) == 0:
                        print(f"[{mode_name}] Removing empty MGE log file: {mge_log_file}")
                        os.remove(mge_log_file)
                        removed_empty_logs += 1
                    else:
                        print(f"[{mode_name}] Keeping non-empty MGE log file: {mge_log_file} ({os.path.getsize(mge_log_file)} bytes)")
                        skipped_nonempty_logs += 1
                
                print(f"[{mode_name}] Removed {removed_empty_logs} empty MGE log files")
                print(f"[{mode_name}] Kept {skipped_nonempty_logs} non-empty MGE log files")
                removed_files += removed_empty_logs
                
                cleaned_dirs.append(mge_log_dir)
            else:
                print(f"[{mode_name}] MGE log directory not found: {mge_log_dir}")
            
            # Remove rename_complete.txt - both modes
            rename_complete_file = os.path.join(mode_dir, "logs/rename_consensus/rename_complete.txt")
            if os.path.exists(rename_complete_file):
                print(f"[{mode_name}] Removing rename_complete.txt file: {rename_complete_file}")
                os.remove(rename_complete_file)
                removed_files += 1
            else:
                print(f"[{mode_name}] rename_complete.txt file not found: {rename_complete_file}")
            
            # Return cleanup statistics
            return {
                "removed_files": removed_files,
                "removed_dirs": removed_dirs,
                "cleaned_dirs": cleaned_dirs
            }
        
        # Cleanup for merge mode (using barcode_recovery_dir for MGE logs)
        merge_stats = cleanup_mode_files(barcode_recovery_dir_merge, "merge")
        
        # Cleanup for concat mode (using barcode_recovery_dir for MGE logs)
        concat_stats = cleanup_mode_files(barcode_recovery_dir_concat, "concat")
        
        # Write completion message and cleanup info to merge output file
        with open(output.merge_complete, 'w') as f:
            f.write("Cleanup complete at " + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n\n")
            f.write(f"Total log files removed: {merge_stats['removed_files']}\n")            
            f.write("Directories fully removed:\n")
            if merge_stats['removed_dirs']:
                for dir_path in merge_stats['removed_dirs']:
                    f.write(f"- {dir_path}\n")
            else:
                f.write("- No directories were fully removed\n")
            
            f.write("\nDirectories cleaned of log files:\n")
            if merge_stats['cleaned_dirs']:
                for dir_path in merge_stats['cleaned_dirs']:
                    f.write(f"- {dir_path}\n")
            else:
                f.write("- No directories were cleaned\n")
            
            f.write("\nPreprocessing mode: merge")
        
        # Write completion message and cleanup info to concat output file
        with open(output.concat_complete, 'w') as f:
            f.write("Cleanup complete at " + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n\n")
            
            f.write(f"Total log files removed: {concat_stats['removed_files']}\n")
            
            f.write("Directories fully removed:\n")
            if concat_stats['removed_dirs']:
                for dir_path in concat_stats['removed_dirs']:
                    f.write(f"- {dir_path}\n")
            else:
                f.write("- No directories were fully removed\n")
            
            f.write("\nDirectories cleaned of log files:\n")
            if concat_stats['cleaned_dirs']:
                for dir_path in concat_stats['cleaned_dirs']:
                    f.write(f"- {dir_path}\n")
            else:
                f.write("- No directories were cleaned\n")
            
            f.write("\nPreprocessing mode: concat")
