Source code for homologyviz.gb_files_manipulation

"""
Utilities for processing GenBank files and BLASTn results in HomologyViz.

This module provides functions to:
    - Convert GenBank (.gb) files to FASTA format for BLASTn (`make_fasta_files`).
    - Run local BLASTn alignments and capture XML results (`run_blastn`,
      `blastn_command_line`).
    - Parse BLASTn XML into structured DataFrames (`get_blast_metadata`,
      `parse_blast_record`).
    - Extract sequence- and feature-level metadata from GenBank records
      (`genbank_files_metadata_to_dataframes`, `parse_genbank_cds_to_df`).
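    - Determine the longest sequence and homology bounds for plotting
      (`get_longest_sequence_dataframe`, `find_lowest_and_highest_homology_dataframe`).
    - Shift plotting coordinates to left-, center-, or right-align sequences, CDS
      features, and alignment regions
      (`adjust_positions_sequences_and_alignments_df_for_plotting`).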

These utilities underpin the data preparation pipeline for visualizing homology and gene
annotations.

Notes
-----
- This file is part of HomologyViz
- BSD 3-Clause License
- Copyright (c) 2024, Iván Muñoz Gutiérrez
"""

from pathlib import Path
import subprocess
import pandas as pd
from pandas import DataFrame

from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio.SeqRecord import SeqRecord
from Bio.Blast import Record



[docs]
def make_fasta_files(gb_files: list[Path], output_path: Path) -> list[Path]:
    """
    Convert GenBank files to FASTA format for downstream processing (e.g., BLASTn).

    Each GenBank file is read with ``SeqIO.read`` and rewritten as a FASTA file
    that keeps only the sequence, the record id, and the description.

    Parameters
    ----------
    gb_files : list of pathlib.Path
        Paths to the GenBank (.gb) files to convert.
    output_path : pathlib.Path
        Directory where the FASTA files are written.

    Returns
    -------
    list of pathlib.Path
        Paths to the generated FASTA (.faa) files, in the same order as `gb_files`.
        An empty input list yields an empty list.

    Notes
    -----
    The output name is the GenBank file's stem plus ``.faa`` (``NC_1.2.gb`` becomes
    ``NC_1.2.faa``). Only the last suffix is dropped, so files that differ only
    after the first dot no longer overwrite each other.
    """
    faa_files = []
    for gb_file in gb_files:
        record = SeqIO.read(gb_file, "genbank")
        # Build a minimal record: sequence, id and description only.
        fasta_record = SeqRecord(
            record.seq, id=record.id, description=record.description
        )
        # Path.stem strips only the final suffix, keeping dotted names distinct.
        output_file = output_path / f"{gb_file.stem}.faa"
        SeqIO.write(fasta_record, output_file, "fasta")
        faa_files.append(output_file)
    return faa_files




[docs]
def run_blastn(faa_files: list[Path], output_path: Path) -> list[Path]:
    """
    BLAST each FASTA file against the next one in the list and collect the XML outputs.

    The files are compared as consecutive pairs: file[0] (query) vs file[1]
    (subject), file[1] vs file[2], and so on. The result of pair ``i`` is written
    to ``result<i>.xml`` inside `output_path` using BLAST output format 5 (XML).

    Parameters
    ----------
    faa_files : list of pathlib.Path
        Ordered paths to the nucleotide FASTA (.faa) files.
    output_path : pathlib.Path
        Directory that receives the BLASTn XML files.

    Returns
    -------
    list of pathlib.Path
        One XML path per consecutive pair; empty when fewer than two files are given.

    Notes
    -----
    A progress message and the text returned by `blastn_command_line` are printed
    for every pair.
    """
    # TODO: Consider logging to file instead of printing directly

    xml_paths = []
    consecutive_pairs = zip(faa_files, faa_files[1:])
    for pair_index, (query_file, subject_file) in enumerate(consecutive_pairs):
        xml_path = output_path / f"result{pair_index}.xml"
        blast_output = blastn_command_line(
            query=query_file, subject=subject_file, outfmt=5, out=xml_path
        )
        # The path is recorded regardless of what the command returned.
        xml_paths.append(xml_path)
        print(f"BLASTing {query_file} (query) and {subject_file} (subject)\n")
        print(blast_output)
    return xml_paths




[docs]
def blastn_command_line(query: Path, subject: Path, out: Path, outfmt: int = 5) -> str:
    """
    Invoke the local ``blastn`` executable on a query/subject pair of FASTA files.

    The command is run through ``subprocess.run`` with output capture enabled and
    ``check=True``; BLASTn itself writes its results to `out`.

    Parameters
    ----------
    query : pathlib.Path
        Path to the query FASTA file.
    subject : pathlib.Path
        Path to the subject FASTA file.
    out : pathlib.Path
        File that BLASTn writes its results to (passed as ``-out``).
    outfmt : int, default=5
        BLAST output format (5 = XML). HomologyViz parses the XML format.

    Returns
    -------
    str
        The captured standard output when the command exits successfully, or the
        captured standard error when it exits with a non-zero status.

    Notes
    -----
    - Only ``subprocess.CalledProcessError`` is handled here (it is printed and its
      stderr returned); any other exception from ``subprocess.run`` propagates.
    """
    # TODO: Add a raise instead of print() for critical failures if you're building a CLI
    # or GUI around this.

    # Assemble the argument list as flag/value pairs; every value is stringified.
    options = (
        ("-query", query),
        ("-subject", subject),
        ("-outfmt", outfmt),
        ("-out", out),
    )
    command = ["blastn"]
    for flag, value in options:
        command.extend((flag, str(value)))

    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError as error:
        print("Error running BLASTn:", error)
        return error.stderr
    return completed.stdout




[docs]
def genbank_files_metadata_to_dataframes(
    gb_files: list[Path],
) -> tuple[DataFrame, DataFrame]:
    """
    Build the record-level and CDS-level metadata tables for a list of GenBank files.

    Every file is read with ``SeqIO.read``. One row per file goes into `gb_df`,
    and the CDS table produced by `parse_genbank_cds_to_df` for each record is
    concatenated into `cds_df`.

    Parameters
    ----------
    gb_files : list of pathlib.Path
        Paths to the GenBank (.gb) files, in plotting order.

    Returns
    -------
    gb_df : pandas.DataFrame
        One row per file with the columns ``file_number`` (position in
        `gb_files`), ``file_path``, ``file_name`` (path stem), ``custom_name``
        (empty string placeholder for the GUI), ``record_name``, ``accession``
        (the record id), ``length``, ``sequence_start`` (0.0) and
        ``sequence_end`` (equal to ``length``). Lengths and coordinates are floats.
    cds_df : pandas.DataFrame
        CDS rows of all files concatenated with a fresh integer index.

    Raises
    ------
    ValueError
        If `gb_files` is empty, because ``pandas.concat`` is given nothing to
        concatenate.

    Notes
    -----
    The two tables are linked through ``file_number`` and ``accession``.
    """
    columns = [
        "file_number",
        "file_path",
        "file_name",
        "custom_name",
        "record_name",
        "accession",
        "length",
        "sequence_start",
        "sequence_end",
    ]
    table = {column: [] for column in columns}
    cds_frames = []

    for file_number, gb_path in enumerate(gb_files):
        record = SeqIO.read(gb_path, "genbank")
        n_bases = float(len(record))
        # Values are listed in the same order as `columns`.
        row = (
            file_number,
            gb_path,
            gb_path.stem,
            "",  # custom name is filled in later from the GUI
            record.name,
            record.id,
            n_bases,
            0.0,
            n_bases,
        )
        for column, value in zip(columns, row):
            table[column].append(value)
        cds_frames.append(
            parse_genbank_cds_to_df(
                record=record, file_number=file_number, accession=record.id
            )
        )

    gb_df = DataFrame(table, columns=columns)
    cds_df = pd.concat(cds_frames, ignore_index=True)
    return gb_df, cds_df




[docs]
def parse_genbank_cds_to_df(
    record: SeqRecord, file_number: int, accession: str
) -> DataFrame:
    """
    Extract CDS feature metadata from a GenBank record and return it as a DataFrame.

    Every feature of type ``CDS`` contributes one row per location part, so a
    multi-segment CDS (e.g. a join) yields several rows that share the same
    ``cds_number``, gene, product and color.

    Parameters
    ----------
    record : Bio.SeqRecord.SeqRecord
        The parsed GenBank record to extract CDS data from.
    file_number : int
        Index of the GenBank file, used for relational tracking.
    accession : str
        Accession ID of the sequence, used for relational grouping.

    Returns
    -------
    pandas.DataFrame
        One row per CDS part with the columns:
        - file_number: index of source file
        - cds_number: index of the CDS feature within the record
        - accession: accession ID of the sequence
        - gene: first ``gene`` qualifier value, or None
        - product: first ``product`` qualifier value, or None
        - start, end: 1-based coordinates as floats; for strand -1 ``start`` is the
          higher coordinate and ``end`` the lower one
        - strand: strand of the part as reported by the location
        - color: first ``Color`` qualifier value, or the default "#ffff00"
        - start_plot, end_plot: initially equal to ``start`` and ``end``

    Notes
    -----
    - All columns are filled once per part, so they always have the same length,
      even when a CDS has more than one part.
    - Coordinates are read through the public ``start``/``end``/``strand``
      attributes of each location part.
    """
    headers = [
        "file_number",
        "cds_number",
        "accession",
        "gene",
        "product",
        "start",
        "end",
        "strand",
        "color",
        "start_plot",
        "end_plot",
    ]
    data = {header: [] for header in headers}
    # Count CDS features explicitly; enumerate() would also count non-CDS features.
    cds_number = 0
    for feature in record.features:
        if feature.type != "CDS":
            continue
        qualifiers = feature.qualifiers
        gene = qualifiers.get("gene")
        product = qualifiers.get("product")
        color = qualifiers.get("Color")
        gene_name = gene[0] if gene else None
        product_name = product[0] if product else None
        color_value = color[0] if color else "#ffff00"  # yellow by default
        # Some CDS are composed of more than one part (introns, or programmed
        # frameshifts such as those found in some transposase genes). Each part
        # becomes its own row and repeats the feature-level metadata.
        for part in feature.location.parts:
            strand = part.strand
            low = float(part.start + 1)  # convert 0-based start to 1-based
            high = float(part.end)
            if strand == -1:
                start, end = high, low
            else:
                start, end = low, high
            data["file_number"].append(file_number)
            data["cds_number"].append(cds_number)
            data["accession"].append(accession)
            data["gene"].append(gene_name)
            data["product"].append(product_name)
            data["start"].append(start)
            data["end"].append(end)
            data["strand"].append(strand)
            data["color"].append(color_value)
            data["start_plot"].append(start)
            data["end_plot"].append(end)
        cds_number += 1
    return DataFrame(data, columns=headers)




[docs]
def get_blast_metadata(
    xml_alignment_result: list[Path],
) -> tuple[DataFrame, DataFrame]:
    """
    Read BLASTn XML result files into an alignment summary table and a regions table.

    Each file is parsed with ``NCBIXML.read``. The summary row is taken from the
    record and its first alignment; the region rows come from
    `parse_blast_record`.

    Parameters
    ----------
    xml_alignment_result : list of pathlib.Path
        Paths to XML-formatted BLASTn result files (outfmt=5).

    Returns
    -------
    alignments_df : pandas.DataFrame
        One row per XML file with the columns:
            - alignment_number (int): position of the file in the input list
            - query_name (str): ``query`` attribute of the BLAST record
            - hit_name (str): ``hit_def`` of the first alignment
            - query_len (int): query sequence length
            - hit_len (int): length of the first alignment's subject

    regions_df : pandas.DataFrame
        Region (HSP) rows of all files concatenated with a fresh integer index.
        The ``alignment_number`` column links it to `alignments_df`.

    Raises
    ------
    IndexError
        If a BLAST record contains no alignments (the first one is always read).
    ValueError
        If `xml_alignment_result` is empty, because ``pandas.concat`` is given
        nothing to concatenate.
    """
    summary_columns = [
        "alignment_number",
        "query_name",
        "hit_name",
        "query_len",
        "hit_len",
    ]
    summary = {column: [] for column in summary_columns}
    region_frames = []
    for number, xml_path in enumerate(xml_alignment_result):
        with open(xml_path, "r") as handle:
            record = NCBIXML.read(handle)
            first_hit = record.alignments[0]
            # Values are listed in the same order as `summary_columns`.
            row = (
                number,
                record.query,
                first_hit.hit_def,
                int(record.query_length),
                int(first_hit.length),
            )
            for column, value in zip(summary_columns, row):
                summary[column].append(value)
            region_frames.append(
                parse_blast_record(blast_record=record, alignment_number=number)
            )
    alignments_df = DataFrame(summary, columns=summary_columns)
    regions_df = pd.concat(region_frames, ignore_index=True)
    return alignments_df, regions_df




[docs]
def parse_blast_record(blast_record: Record, alignment_number: int) -> DataFrame:
    """
    Turn the HSPs of a BLAST record's first alignment into a regions DataFrame.

    Only ``blast_record.alignments[0]`` is read. Each of its high-scoring pairs
    (HSPs) becomes one row holding the raw coordinates, a copy of those
    coordinates for plotting, and identity statistics.

    Parameters
    ----------
    blast_record : Bio.Blast.Record
        A parsed BLAST record, e.g. from ``Bio.Blast.NCBIXML.read()``.
        Must contain at least one alignment.
    alignment_number : int
        Identifier stored in every row to link the regions to their alignment.

    Returns
    -------
    pandas.DataFrame
        One row per HSP with the columns:
        - alignment_number : int
        - query_from, query_to : float
        - query_from_plot, query_to_plot : float (initially equal to the raw values)
        - hit_from, hit_to : float
        - hit_from_plot, hit_to_plot : float (initially equal to the raw values)
        - identity : int (number of identical matches)
        - positive : int (number of positive-scoring matches)
        - align_len : int (alignment length)
        - homology : float (identity / alignment length)
    """
    columns = [
        "alignment_number",
        "query_from",
        "query_to",
        "query_from_plot",
        "query_to_plot",
        "hit_from",
        "hit_to",
        "hit_from_plot",
        "hit_to_plot",
        "identity",
        "positive",
        "align_len",
        "homology",
    ]
    table = {column: [] for column in columns}
    for hsp in blast_record.alignments[0].hsps:
        query_from = float(hsp.query_start)
        query_to = float(hsp.query_end)
        hit_from = float(hsp.sbjct_start)
        hit_to = float(hsp.sbjct_end)
        identical = int(hsp.identities)
        positives = int(hsp.positives)
        aligned = int(hsp.align_length)
        # Values are listed in the same order as `columns`; the *_plot entries
        # start out as copies of the raw coordinates.
        row = (
            alignment_number,
            query_from,
            query_to,
            query_from,
            query_to,
            hit_from,
            hit_to,
            hit_from,
            hit_to,
            identical,
            positives,
            aligned,
            identical / aligned,
        )
        for column, value in zip(columns, row):
            table[column].append(value)
    return pd.DataFrame(table, columns=columns)




[docs]
def get_longest_sequence_dataframe(gb_records: DataFrame) -> int:
    """
    Return the largest value of the 'length' column of the GenBank metadata table.

    Parameters
    ----------
    gb_records : pandas.DataFrame
        DataFrame containing GenBank metadata. Must include a 'length' column.

    Returns
    -------
    int
        Length (in base pairs) of the longest sequence. The value is whatever
        ``Series.max()`` yields for the column, so it carries the column's dtype.
    """
    return gb_records["length"].max()




[docs]
def find_lowest_and_highest_homology_dataframe(regions_df: DataFrame) -> tuple:
    """
    Return the smallest and largest values of the 'homology' column.

    Parameters
    ----------
    regions_df : pandas.DataFrame
        DataFrame containing homology region metadata. Must include a 'homology'
        column.

    Returns
    -------
    lowest : float
        Minimum of ``regions_df["homology"]``.
    highest : float
        Maximum of ``regions_df["homology"]``.
    """
    homology = regions_df["homology"]
    return homology.min(), homology.max()




[docs]
def adjust_positions_sequences_df_left(gb_records: DataFrame, cds: DataFrame) -> None:
    """
    Put every sequence and its CDS features back at the left edge (start at 0).

    Both DataFrames are modified in place: each sequence spans from 0.0 to its
    length, and the CDS plotting coordinates are overwritten with the original
    ``start``/``end`` coordinates.

    Parameters
    ----------
    gb_records : pandas.DataFrame
        GenBank sequence metadata. Must include 'length'; the 'sequence_start'
        and 'sequence_end' columns are overwritten.

    cds : pandas.DataFrame
        CDS feature metadata. Must include 'start' and 'end'; the 'start_plot'
        and 'end_plot' columns are overwritten.
    """
    gb_records["sequence_start"] = 0.0
    gb_records["sequence_end"] = gb_records["length"]
    # Copy each raw coordinate column onto its plotting counterpart.
    for plot_column, source_column in (("start_plot", "start"), ("end_plot", "end")):
        cds[plot_column] = cds[source_column]




[docs]
def adjust_positions_sequences_df_center(
    gb_records: DataFrame, cds: DataFrame, size_longest_sequence: int
) -> None:
    """
    Adjust plotting coordinates to center-align each sequence and its CDS features.

    Every sequence is shifted by half the difference between
    `size_longest_sequence` and its own length. The plotting columns are
    recomputed from the unshifted data (``length`` for sequences, ``start`` and
    ``end`` for CDS features), so the result does not depend on the current
    layout and calling the function repeatedly gives the same coordinates.

    Both DataFrames are modified in place.

    Parameters
    ----------
    gb_records : pandas.DataFrame
        DataFrame containing metadata for GenBank sequences. Must include
        'length'; 'sequence_start' and 'sequence_end' are overwritten. Records
        are identified by the 'file_number' column, or by the row index label
        when that column is absent.

    cds : pandas.DataFrame
        DataFrame containing CDS feature metadata. Must include 'file_number',
        'start' and 'end'; 'start_plot' and 'end_plot' are overwritten.

    size_longest_sequence : int
        Length of the longest sequence in the dataset. Used to compute the centering shift.
    """
    shifts = (size_longest_sequence - gb_records["length"]) / 2
    if "file_number" in gb_records.columns:
        file_numbers = gb_records["file_number"]
    else:
        file_numbers = gb_records.index
    shift_by_file = dict(zip(file_numbers, shifts))

    # A left-aligned sequence spans 0..length, so the centered span is
    # shift..length + shift.
    gb_records["sequence_start"] = shifts
    gb_records["sequence_end"] = gb_records["length"] + shifts

    # CDS rows whose file is not listed in gb_records keep their raw coordinates.
    cds_shift = cds["file_number"].map(shift_by_file).fillna(0.0).astype(float)
    cds["start_plot"] = cds["start"] + cds_shift
    cds["end_plot"] = cds["end"] + cds_shift




[docs]
def adjust_positions_sequences_df_right(
    gb_records: DataFrame, cds: DataFrame, size_longest_sequence: int
) -> None:
    """
    Adjust plotting coordinates to right-align sequences and CDS features.

    Every sequence is shifted by the difference between `size_longest_sequence`
    and its own length. The plotting columns are recomputed from the unshifted
    data (``length`` for sequences, ``start`` and ``end`` for CDS features), so
    the result does not depend on the current layout and calling the function
    repeatedly gives the same coordinates.

    Both DataFrames are modified in place.

    Parameters
    ----------
    gb_records : pandas.DataFrame
        DataFrame containing GenBank sequence metadata. Must include 'length';
        'sequence_start' and 'sequence_end' are overwritten. Records are
        identified by the 'file_number' column, or by the row index label when
        that column is absent.

    cds : pandas.DataFrame
        DataFrame containing CDS feature metadata. Must include 'file_number',
        'start' and 'end'; 'start_plot' and 'end_plot' are overwritten.

    size_longest_sequence : int
        Length of the longest sequence in the dataset. Used to calculate the shift
        needed to right-align shorter sequences.
    """
    shifts = size_longest_sequence - gb_records["length"]
    if "file_number" in gb_records.columns:
        file_numbers = gb_records["file_number"]
    else:
        file_numbers = gb_records.index
    shift_by_file = dict(zip(file_numbers, shifts))

    # A left-aligned sequence spans 0..length, so the right-aligned span is
    # shift..length + shift (i.e. it ends at size_longest_sequence).
    gb_records["sequence_start"] = shifts
    gb_records["sequence_end"] = gb_records["length"] + shifts

    # CDS rows whose file is not listed in gb_records keep their raw coordinates.
    cds_shift = cds["file_number"].map(shift_by_file).fillna(0.0).astype(float)
    cds["start_plot"] = cds["start"] + cds_shift
    cds["end_plot"] = cds["end"] + cds_shift




[docs]
def adjust_positions_alignments_df_left(regions: DataFrame) -> None:
    """
    Restore the plotting coordinates of alignment regions to their raw BLAST values.

    Each ``*_plot`` column is overwritten in place with the matching raw
    coordinate column, which removes any centering or right-alignment offset.

    Parameters
    ----------
    regions : pandas.DataFrame
        DataFrame containing BLAST alignment region metadata.
        Must include the raw columns 'query_from', 'query_to', 'hit_from' and
        'hit_to'; the columns 'query_from_plot', 'query_to_plot',
        'hit_from_plot' and 'hit_to_plot' are overwritten.
    """
    plot_and_raw_columns = (
        ("query_from_plot", "query_from"),
        ("query_to_plot", "query_to"),
        ("hit_from_plot", "hit_from"),
        ("hit_to_plot", "hit_to"),
    )
    for plot_column, raw_column in plot_and_raw_columns:
        regions[plot_column] = regions[raw_column]




[docs]
def adjust_positions_alignments_df_center(
    alignments: DataFrame, regions: DataFrame, size_longest_sequence: int
) -> None:
    """
    Center-align alignment regions for plotting relative to the longest sequence.

    Query coordinates are shifted by half the difference between
    `size_longest_sequence` and the alignment's ``query_len``; hit coordinates
    by half the difference to its ``hit_len``. The ``*_plot`` columns are
    recomputed from the raw BLAST coordinates, so the result does not depend on
    the current layout and repeated calls give the same coordinates (also when
    only the hit side needs a shift).

    `regions` is modified in place.

    Parameters
    ----------
    alignments : pandas.DataFrame
        DataFrame containing BLAST alignment summary metadata. Must include
        'query_len' and 'hit_len'. Alignments are identified by the
        'alignment_number' column, or by the row index label when that column
        is absent.

    regions : pandas.DataFrame
        DataFrame containing BLAST alignment region metadata. Must include
        'alignment_number', 'query_from', 'query_to', 'hit_from' and 'hit_to';
        'query_from_plot', 'query_to_plot', 'hit_from_plot' and 'hit_to_plot'
        are overwritten.

    size_longest_sequence : int
        Length of the longest sequence in the dataset. Used to calculate the centering
        shift.
    """
    if "alignment_number" in alignments.columns:
        numbers = alignments["alignment_number"]
    else:
        numbers = alignments.index
    query_shift = dict(
        zip(numbers, (size_longest_sequence - alignments["query_len"]) / 2)
    )
    hit_shift = dict(
        zip(numbers, (size_longest_sequence - alignments["hit_len"]) / 2)
    )

    # Regions whose alignment is not listed in `alignments` keep their raw values.
    region_keys = regions["alignment_number"]
    query_offset = region_keys.map(query_shift).fillna(0.0).astype(float)
    hit_offset = region_keys.map(hit_shift).fillna(0.0).astype(float)

    regions["query_from_plot"] = regions["query_from"] + query_offset
    regions["query_to_plot"] = regions["query_to"] + query_offset
    regions["hit_from_plot"] = regions["hit_from"] + hit_offset
    regions["hit_to_plot"] = regions["hit_to"] + hit_offset




[docs]
def adjust_positions_alignments_df_right(
    alignments: DataFrame, regions: DataFrame, size_longest_sequence: int
) -> None:
    """
    Right-align alignment regions for plotting relative to the longest sequence.

    Query coordinates are shifted by the difference between
    `size_longest_sequence` and the alignment's ``query_len``; hit coordinates
    by the difference to its ``hit_len``. The ``*_plot`` columns are recomputed
    from the raw BLAST coordinates, so the result does not depend on the current
    layout and repeated calls give the same coordinates (also when only the hit
    side needs a shift).

    `regions` is modified in place.

    Parameters
    ----------
    alignments : pandas.DataFrame
        DataFrame containing summary metadata for each alignment. Must include
        'query_len' and 'hit_len'. Alignments are identified by the
        'alignment_number' column, or by the row index label when that column
        is absent.

    regions : pandas.DataFrame
        DataFrame with alignment region metadata. Must include
        'alignment_number', 'query_from', 'query_to', 'hit_from' and 'hit_to';
        'query_from_plot', 'query_to_plot', 'hit_from_plot' and 'hit_to_plot'
        are overwritten.

    size_longest_sequence : int
        The length of the longest sequence in the dataset. Used to compute the
        right-shift offset for alignment display.
    """
    if "alignment_number" in alignments.columns:
        numbers = alignments["alignment_number"]
    else:
        numbers = alignments.index
    query_shift = dict(zip(numbers, size_longest_sequence - alignments["query_len"]))
    hit_shift = dict(zip(numbers, size_longest_sequence - alignments["hit_len"]))

    # Regions whose alignment is not listed in `alignments` keep their raw values.
    region_keys = regions["alignment_number"]
    query_offset = region_keys.map(query_shift).fillna(0.0).astype(float)
    hit_offset = region_keys.map(hit_shift).fillna(0.0).astype(float)

    regions["query_from_plot"] = regions["query_from"] + query_offset
    regions["query_to_plot"] = regions["query_to"] + query_offset
    regions["hit_from_plot"] = regions["hit_from"] + hit_offset
    regions["hit_to_plot"] = regions["hit_to"] + hit_offset




[docs]
def adjust_positions_sequences_and_alignments_df_for_plotting(
    gb_records: DataFrame,
    cds: DataFrame,
    alignments: DataFrame,
    regions: DataFrame,
    size_longest_sequence: None | int = None,
    position: str = "left",
) -> None:
    """
    Adjust plotting coordinates for sequences, CDS features, and alignments.

    This function dispatches layout adjustment functions to shift the positions of
    sequences, genes (CDS), and alignment regions based on the desired layout:
    left-, center-, or right-aligned. It modifies the relevant plotting columns
    (`*_plot`) in-place.

    Parameters
    ----------
    gb_records : pandas.DataFrame
        DataFrame containing metadata for GenBank sequences.
        Must include 'length', 'sequence_start', 'sequence_end', and 'file_number'.

    cds : pandas.DataFrame
        DataFrame containing CDS metadata with columns such as 'start', 'end',
        'start_plot', 'end_plot', and 'file_number'.

    alignments : pandas.DataFrame
        DataFrame summarizing each alignment. Must include 'alignment_number',
        'query_len', and 'hit_len'.

    regions : pandas.DataFrame
        DataFrame describing aligned regions between sequences.
        Must include 'alignment_number', and the columns:
        'query_from_plot', 'query_to_plot', 'hit_from_plot', 'hit_to_plot'.

    size_longest_sequence : int or None, optional
        Length of the longest sequence, used when centering or right-aligning. Not
        required if `position="left"`.

    position : str, default="left"
        Layout alignment option for plotting. Must be one of: "left", "center", or "right".
    """
    if position == "left":
        adjust_positions_sequences_df_left(gb_records=gb_records, cds=cds)
        adjust_positions_alignments_df_left(regions=regions)
    if position == "center":
        adjust_positions_sequences_df_center(
            gb_records=gb_records,
            cds=cds,
            size_longest_sequence=size_longest_sequence,
        )
        adjust_positions_alignments_df_center(
            alignments=alignments,
            regions=regions,
            size_longest_sequence=size_longest_sequence,
        )
    if position == "right":
        adjust_positions_sequences_df_right(
            gb_records=gb_records,
            cds=cds,
            size_longest_sequence=size_longest_sequence,
        )
        adjust_positions_alignments_df_right(
            alignments=alignments,
            regions=regions,
            size_longest_sequence=size_longest_sequence,
        )




[docs]
def check_if_alignments_are_at_left(regions: DataFrame) -> bool:
    """
    Check whether alignment regions are left-aligned.

    The plotting coordinates are compared with the raw BLAST coordinates using
    ``Series.equals``. 'query_from_plot' is always compared with 'query_from';
    the remaining pairs ('query_to', 'hit_from', 'hit_to' and their ``*_plot``
    columns) are compared whenever both columns of the pair are present. The
    hit columns matter because a layout can shift only the hit side and leave
    the query coordinates untouched.

    Parameters
    ----------
    regions : pandas.DataFrame
        DataFrame containing alignment region metadata.
        Must include the columns 'query_from' and 'query_from_plot'.

    Returns
    -------
    bool
        True if every compared plotting column equals its raw column, False
        otherwise.
    """
    if not regions["query_from_plot"].equals(regions["query_from"]):
        return False
    optional_pairs = (
        ("query_to_plot", "query_to"),
        ("hit_from_plot", "hit_from"),
        ("hit_to_plot", "hit_to"),
    )
    present = set(regions.columns)
    return all(
        regions[plot_column].equals(regions[raw_column])
        for plot_column, raw_column in optional_pairs
        if plot_column in present and raw_column in present
    )




[docs]
def check_if_sequences_are_at_left(cds: DataFrame) -> bool:
    """
    Check whether CDS features are left-aligned for plotting.

    The plotting coordinates are compared with the original coordinates using
    ``Series.equals``. 'start_plot' is always compared with 'start'; 'end_plot'
    is compared with 'end' when both columns are present.

    Parameters
    ----------
    cds : pandas.DataFrame
        DataFrame containing CDS feature metadata.
        Must include 'start' and 'start_plot' columns.

    Returns
    -------
    bool
        True if every compared plotting column equals its original column,
        False otherwise.

    Notes
    -----
    Only the CDS table is inspected. A True result therefore says nothing about
    sequences that have no CDS rows or whose shift is zero.
    """
    if not cds["start_plot"].equals(cds["start"]):
        return False
    if "end_plot" in cds.columns and "end" in cds.columns:
        return cds["end_plot"].equals(cds["end"])
    return True



if __name__ == "__main__":
    # Manual smoke test: parse one or more GenBank files and print their CDS table.
    # Paths are taken from the command line; with no arguments the developer's
    # sample file is used, which only exists on the original author's machine.
    import sys

    default_gb = Path(
        "/Users/msp/Documents/Coding/python_projects/HomologyViz/data/SW4848_paper/Tn21.gb"
    )
    gb_paths = [Path(argument) for argument in sys.argv[1:]] or [default_gb]
    gb_df, cds_df = genbank_files_metadata_to_dataframes(gb_paths)
    print(cds_df)