"""
Functions and classes to generate graphical representations of BLASTn alignments.
Homology Visualization (HomologyViz) uses GenBank files (.gb) to align sequences, plot
genes, and visualize homology regions. Gene features are extracted from the `CDS` section
of each GenBank file.
Gene color customization is supported by adding a `/Color` qualifier to a `CDS` feature.
For example:
/Color="#00ff00"
will render the gene in green. If no color is provided, HomologyViz defaults to yellow.
Notes
-----
- This file is part of HomologyViz
- BSD 3-Clause License
- Copyright (c) 2024, Iván Muñoz Gutiérrez
"""
from pathlib import Path
from pandas import DataFrame
import plotly.graph_objects as go
from plotly.graph_objects import Figure, Heatmap
import plotly.colors as colors
import numpy as np
import matplotlib.colors as mcolors
from homologyviz.parameters import PlotParameters
from homologyviz.arrow import Arrow
from homologyviz.rectangle_bezier import RectangleCurveHeight
from homologyviz import gb_files_manipulation as genbank
from homologyviz import miscellaneous as misc
# TODO: The remove_annotations_by_name needs checking because Plotly annotations can be
# stored as Layout.annotation.Annotation objects, not just dicts. We may get a
# TypeError on annotation["name"] so we might need to switch to annotation.name
[docs]
def create_color_line(colors: str) -> Figure:
    """
    Build a slim, axis-free heatmap strip that previews a colorscale.

    The strip is a single-row heatmap whose values run evenly from 0 to 1, so
    the whole range of the requested colorscale is shown from left to right.

    Parameters
    ----------
    colors : str
        Colorscale passed straight to the heatmap's ``colorscale`` argument
        (e.g., "Greys", "Greens", "GnBu", "Blues_r"). Inside this function the
        parameter shadows the module-level ``colors`` alias of
        ``plotly.colors``.

    Returns
    -------
    figure : plotly.graph_objects.Figure
        A 40-pixel-high figure containing only the gradient: no color bar, no
        axes, and no margins.
    """
    # A 1 x 1000 grid is fine-grained enough for the gradient to look smooth.
    gradient = np.linspace(0, 1, 1000).reshape(1, -1)
    strip = Heatmap(
        z=gradient,
        colorscale=colors,
        showscale=False,  # no color bar next to the strip
        xgap=0,
        ygap=0,
    )
    figure = Figure(strip)
    figure.update_layout(
        xaxis={"visible": False},
        yaxis={"visible": False},
        height=40,  # just tall enough to read the gradient
        margin={"l": 0, "r": 0, "t": 0, "b": 0},
        plot_bgcolor="white",
    )
    return figure
[docs]
def get_color_from_colorscale(value: float, colorscale_name: str = "Greys") -> str:
    """
    Look up the color found at a given position of a Plotly colorscale.

    Parameters
    ----------
    value : float
        Position within the colorscale, expected to be normalized to 0-1.
    colorscale_name : str, default="Greys"
        Name of the Plotly colorscale to sample (e.g., "Greys", "Blues",
        "Viridis").

    Returns
    -------
    str
        The first (and only) color returned by
        ``plotly.colors.sample_colorscale`` for ``value``, e.g.
        "rgb(200, 200, 200)".
    """
    # sample_colorscale works on a list of sample points; we ask for one.
    sample_points = [value]
    sampled = colors.sample_colorscale(colorscale_name, sample_points)
    return sampled[0]
[docs]
def get_truncated_colorscale(
    colorscale_name: str = "Greys",
    vmin: float = 0,
    vmax: float = 0.75,
    n_samples: int = 256,
) -> list[tuple[float, str]]:
    """
    Sample a sub-range of a Plotly colorscale.

    ``n_samples`` evenly spaced positions between ``vmin`` and ``vmax`` are
    sampled from the named colorscale, which lets callers focus the colors on
    a specific range of values (e.g., homology identity percentages).

    Parameters
    ----------
    colorscale_name : str, default="Greys"
        Name of the Plotly colorscale to sample from.
    vmin : float, default=0
        Lower bound of the sampled range (normalized, 0-1).
    vmax : float, default=0.75
        Upper bound of the sampled range (normalized, 0-1). For "Greys" a
        value of exactly 1 is replaced by 0.99.
    n_samples : int, default=256
        Number of positions sampled across the range.

    Returns
    -------
    list
        The result of ``plotly.colors.sample_colorscale`` for the sampled
        positions.
        NOTE(review): that call presumably returns a flat list of color
        strings rather than the (position, color) tuples the annotation
        advertises -- confirm and adjust the annotation if so.
    """
    # IMPORTANT: "Greys" at 100% (vmax == 1) renders shadows with off colors,
    # so the upper bound is pulled back slightly for that one case.
    needs_greys_cap = colorscale_name == "Greys" and vmax == 1
    upper_bound = 0.99 if needs_greys_cap else vmax
    sample_points = np.linspace(vmin, upper_bound, n_samples)
    return colors.sample_colorscale(colorscale_name, sample_points)
[docs]
def sample_from_truncated_colorscale(
    truncated_colorscale: list[tuple[float, str]], homology_value: float
) -> str:
    """
    Interpolate a color between the two ends of a truncated colorscale.

    Only the first and last entries of ``truncated_colorscale`` are used; the
    returned color lies between them at the relative position
    ``homology_value``.

    Parameters
    ----------
    truncated_colorscale : list
        Sequence of colors, typically produced by `get_truncated_colorscale`.
    homology_value : float
        Relative position between the first and last color, expected to be
        between 0 and 1.

    Returns
    -------
    str
        The interpolated color as computed by
        ``plotly.colors.find_intermediate_color`` with ``colortype="rgb"``
        (e.g., "rgb(100, 150, 200)").
    """
    darkest_end = truncated_colorscale[0]
    lightest_end = truncated_colorscale[-1]
    return colors.find_intermediate_color(
        darkest_end,
        lightest_end,
        homology_value,
        colortype="rgb",
    )
[docs]
def sample_colorscale_setting_lowest_and_highest_homologies(
    truncated_colorscale: list[tuple[float, str]],
    homology_value: float,
    lowest_homology: float,
    highest_homology: float,
) -> str:
    """
    Pick a color after rescaling a homology value to the dataset's range.

    The homology value is mapped linearly so that ``lowest_homology`` lands on
    the first color of the truncated colorscale and ``highest_homology`` on
    the last one. The color is then interpolated between those two ends.

    Parameters
    ----------
    truncated_colorscale : list
        Sequence of colors; only the first and last entries are used.
    homology_value : float
        The homology identity value to map (e.g., 0.83 for 83% identity).
    lowest_homology : float
        The lowest identity value in the dataset.
    highest_homology : float
        The highest identity value in the dataset.

    Returns
    -------
    str
        The interpolated color as computed by
        ``plotly.colors.find_intermediate_color`` with ``colortype="rgb"``.
    """
    homology_span = highest_homology - lowest_homology
    distance_from_top = highest_homology - homology_value
    # With a zero-width range every value sits at the top of the scale; this
    # also avoids dividing by zero.
    position = (
        1.0
        if homology_span == 0
        else 1.0 - (distance_from_top * 1.0) / homology_span
    )
    return colors.find_intermediate_color(
        truncated_colorscale[0],
        truncated_colorscale[-1],
        position,
        colortype="rgb",
    )
[docs]
def plot_colorbar_legend(
    fig: Figure,
    colorscale: list[tuple[float, str]],
    min_value: float,
    max_value: float,
    set_colorscale_to_extreme_homologies: bool = False,
) -> Figure:
    """
    Append a horizontal "Identity" colorbar to the figure.

    The colorbar is attached to a dummy scatter trace named
    "colorbar legend" that has no visible points. Tick labels show the
    minimum and maximum identity as percentages; when both are equal a
    single tick is shown on a shorter bar with the title on the right.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        Figure that receives the colorbar trace.
    colorscale : list of tuple[float, str]
        Colorscale used for the identity colors. It is used as-is when
        `set_colorscale_to_extreme_homologies` is True and the two bounds
        differ; otherwise it is forwarded to `get_truncated_colorscale` as
        its first argument.
        NOTE(review): `get_truncated_colorscale` names that argument
        `colorscale_name` -- confirm which form callers actually pass here.
    min_value : float
        Minimum identity represented by the colorbar (normalized 0-1).
    max_value : float
        Maximum identity represented by the colorbar (normalized 0-1).
    set_colorscale_to_extreme_homologies : bool, default=False
        Whether the colorscale was stretched to the dataset's actual minimum
        and maximum identities.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same figure, with the colorbar trace added.
    """
    if min_value == max_value:
        # A single identity value: one tick, shorter bar, title to the side.
        legend_colorscale = get_truncated_colorscale(colorscale, min_value, max_value)
        tickvals = [max_value]
        ticktext = [f"{max_value * 100:.2f}%"]
        colorbar_len = 0.15
        title_position = "right"
    else:
        # A real range: ticks at both ends of the bar.
        legend_colorscale = (
            colorscale
            if set_colorscale_to_extreme_homologies
            else get_truncated_colorscale(colorscale, min_value, max_value)
        )
        tickvals = [min_value, max_value]
        ticktext = [f"{min_value*100:.2f}%", f"{max_value*100:.2f}%"]
        colorbar_len = 0.3
        title_position = "bottom"

    colorbar = {
        "title": {"text": "Identity", "font": {"size": 18}, "side": title_position},
        "orientation": "h",
        "x": 0.75,
        "xanchor": "center",
        "y": -0.15,
        "yanchor": "top",
        "tickfont": {"size": 18},
        "tickvals": tickvals,
        "ticktext": ticktext,
        "len": colorbar_len,
    }
    marker = {
        "colorscale": legend_colorscale,
        "cmin": min_value,
        "cmax": max_value,
        "color": [min_value, max_value],
        "colorbar": colorbar,
    }
    legend_trace = go.Scatter(
        x=[None],  # dummy point: the trace exists only to carry the colorbar
        y=[None],
        customdata=[min_value, max_value],
        name="colorbar legend",
        mode="markers",
        marker=marker,
        hoverinfo="none",
    )
    fig.add_trace(legend_trace)
    return fig
[docs]
def plot_polygon(
    fig: Figure,
    x_values: list,
    y_values: list,
    color: str = "blue",
    name: str = "",
    customdata: list | None = None,
    visible: bool = True,
) -> None:
    """
    Add a filled polygon to the Plotly figure to represent a gene or homology region.

    The polygon is a scatter trace in "lines" mode with ``fill="toself"``, so
    the shape described by `x_values` and `y_values` is outlined and filled
    with the same color.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which the polygon should be added.
    x_values : list
        The x-coordinates of the polygon vertices.
    y_values : list
        The y-coordinates of the polygon vertices.
    color : str, default="blue"
        The color used for both the polygon outline and fill.
    name : str, optional
        Text shown on hover (the hover template is ``%{text}``). The trace's
        own ``name`` is always set to an empty string.
    customdata : list or None, optional
        Extra data stored on the trace. ``None`` (the default) is treated as
        an empty list.
    visible : bool, default=True
        Whether the polygon is initially visible in the plot.

    Returns
    -------
    None
        The function modifies the input figure in-place and returns nothing.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if customdata is None:
        customdata = []
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=y_values,
            text=name,
            fill="toself",
            mode="lines",
            line=dict(color=color, width=1),
            fillcolor=color,
            name="",
            customdata=customdata,
            hoverlabel=dict(font_size=14),
            # <extra></extra> suppresses the secondary hover box.
            hovertemplate="%{text}<extra></extra>",
            visible=visible,
        )
    )
[docs]
def plot_line(
    fig: Figure,
    x_values: list,
    y_values: list,
    name: str | None = None,
    customdata: list | None = None,
    color: str = "black",
) -> None:
    """
    Add a thick line to the Plotly figure to represent a DNA sequence.

    The line is a scatter trace in "lines" mode with a width of 4 drawn
    through the given coordinates.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which the line should be added.
    x_values : list
        The x-coordinates of the line.
    y_values : list
        The y-coordinates of the line.
    name : str or None, optional
        Trace name; it is also interpolated into the hover template, so with
        the default ``None`` the hover text reads "None".
    customdata : list or None, optional
        Extra data stored on the trace. ``None`` (the default) is treated as
        an empty list.
    color : str, default="black"
        The color of the line (e.g., "black", "#FF0000", or "rgb(0,0,0)").

    Returns
    -------
    None
        The function modifies the input figure in-place and returns nothing.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if customdata is None:
        customdata = []
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=y_values,
            mode="lines",
            name=name,
            customdata=customdata,
            line=dict(color=color, width=4),
            hoverlabel=dict(font_size=14),
            # <extra></extra> suppresses the secondary hover box.
            hovertemplate=f"{name}<extra></extra>",
        )
    )
[docs]
def plot_dna_sequences(
    fig: Figure,
    gb_records: DataFrame,
    y_separation: int = 10,
) -> Figure:
    """
    Draw one horizontal line per record in `gb_records`.

    Records are drawn in DataFrame order from the top down: the first one at
    ``len(gb_records) * y_separation`` and each following one `y_separation`
    lower. Every line is named "Sequence: <record_name>".

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which the sequence lines will be added.
    gb_records : pandas.DataFrame
        One row per sequence. The columns read here are 'sequence_start',
        'sequence_end', and 'record_name'.
    y_separation : int, default=10
        The vertical distance between stacked sequence lines.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same figure, with all DNA sequences plotted as horizontal lines.
    """
    current_y = len(gb_records) * y_separation
    for _, record in gb_records.iterrows():
        line_x = np.array([record["sequence_start"], record["sequence_end"]])
        line_y = np.array([current_y, current_y])
        plot_line(
            fig,
            line_x,
            line_y,
            name=f"Sequence: {record['record_name']}",
        )
        # Step down to the next row.
        current_y -= y_separation
    return fig
[docs]
def plot_genes(
    fig: Figure,
    number_gb_records: int,
    longest_sequence: int,
    cds_records: DataFrame,
    name_from: str = "product",
    y_separation: int = 10,
) -> Figure:
    """
    Plot arrows representing genes using metadata from a CDS DataFrame.

    Each CDS row is rendered as an arrow-shaped polygon. The row of a gene is
    derived from its 'file_number' (0 is the top sequence), which matches how
    the gene annotation functions in this module position their labels. A
    GenBank file without any CDS therefore simply leaves its row empty instead
    of shifting the genes of the following files. The arrowhead size is 2% of
    the longest sequence.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which gene arrows will be added.
    number_gb_records : int
        The total number of GenBank files, used to calculate vertical placement.
    longest_sequence : int
        The length of the longest DNA sequence, used to scale arrowhead size.
    cds_records : pandas.DataFrame
        A DataFrame containing gene metadata. The columns read here are
        'file_number', 'cds_number', 'start_plot', 'end_plot', 'color',
        and either 'product' or 'gene'.
    name_from : str, default="product"
        "product" uses the 'product' column as label; any other value uses the
        'gene' column.
    y_separation : int, default=10
        Vertical spacing between rows of gene arrows for different GenBank files.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same figure, with gene arrows plotted for each sequence.
    """
    # Ratio of arrowhead height to the length of the longest sequence.
    ratio = 0.02
    head_height = longest_sequence * ratio
    # Grouping only fixes the drawing order (by file, then row order); the
    # vertical position comes from each row's own file_number.
    for _, cds_group in cds_records.groupby(["file_number"]):
        for _, row in cds_group.iterrows():
            file_number = row["file_number"]
            cds_number = row["cds_number"]
            color = row["color"]
            # File 0 is drawn at the top (number_gb_records * y_separation)
            # and the last file at y_separation.
            y = (number_gb_records - file_number) * y_separation
            arrow = Arrow(
                x1=row["start_plot"],
                x2=row["end_plot"],
                y=y,
                head_height=head_height,
            )
            x_values, y_values = arrow.get_coordinates()
            # Fall back to a placeholder when the label is missing.
            name = row["product"] if name_from == "product" else row["gene"]
            if name is None:
                name = "no name"
            plot_polygon(
                fig,
                x_values,
                y_values,
                color=color,
                name=name,
                customdata=[file_number, cds_number, name, color],
            )
    return fig
[docs]
def plot_homology_regions_with_dataframe(
    fig: Figure,
    alignments_df: DataFrame,
    regions_df: DataFrame,
    y_separation: int = 10,
    homology_padding: float = 1.1,
    colorscale: str = "Greys",
    straight_heights: bool = True,
    minimum_homology_length: int = 1,
    set_colorscale_to_extreme_homologies: bool = False,
    lowest_homology: None | float = None,
    highest_homology: None | float = None,
) -> Figure:
    """
    Draw every homology region as a filled polygon between two sequence rows.

    Regions are grouped by 'alignment_number'. The first group is drawn
    between the top row and the row below it, and each following group one
    row lower. The polygon color is sampled from `colorscale` according to
    the region's 'homology' value.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which homology polygons will be added.
    alignments_df : pandas.DataFrame
        Alignment metadata. Only its length is used here: the top row sits at
        ``(len(alignments_df) + 1) * y_separation``.
    regions_df : pandas.DataFrame
        One row per homology block. The columns read here are
        'alignment_number', 'query_from_plot', 'query_to_plot',
        'hit_from_plot', 'hit_to_plot', and 'homology'.
    y_separation : int, default=10
        Vertical distance between stacked sequence rows.
    homology_padding : float, default=1.1
        Gap left between a polygon edge and the sequence row it faces.
    colorscale : str
        Passed unchanged as ``truncated_colorscale`` to the sampling helpers.
        NOTE(review): those helpers index its first and last element, so this
        presumably needs to be a sampled colorscale rather than a name such as
        the "Greys" default -- confirm against callers.
    straight_heights : bool, default=True
        If True, polygons have straight sides; otherwise the coordinates are
        produced by `RectangleCurveHeight`.
    minimum_homology_length : int, default=1
        Regions whose query length (``query_to_plot - query_from_plot``) is
        less than or equal to this value are added but not visible.
    set_colorscale_to_extreme_homologies : bool, default=False
        If True, colors are scaled between `lowest_homology` and
        `highest_homology`.
    lowest_homology : float or None, optional
        Minimum identity used for scaling when
        `set_colorscale_to_extreme_homologies` is True.
    highest_homology : float or None, optional
        Maximum identity used for scaling when
        `set_colorscale_to_extreme_homologies` is True.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same figure, with homology regions plotted as polygons.
    """
    # There is one more sequence row than there are alignments; start at the
    # top row and walk down one row per alignment group.
    row_y = (len(alignments_df) + 1) * y_separation
    # NOTE(review): the row advances once per group present in regions_df, so
    # this presumably relies on every alignment having at least one region.
    for _, alignment_regions in regions_df.groupby(["alignment_number"]):
        # Query edge hangs just below the upper row, hit edge sits just above
        # the lower row.
        query_y = row_y - homology_padding
        hit_y = row_y - y_separation + homology_padding
        for _, region in alignment_regions.iterrows():
            query_from = region["query_from_plot"]
            query_to = region["query_to_plot"]
            hit_to = region["hit_to_plot"]
            hit_from = region["hit_from_plot"]
            homology_length = query_to - query_from
            # Regions that are too short are still added, only hidden.
            visible = not (homology_length <= minimum_homology_length)
            if straight_heights:
                # Closed quadrilateral: the first vertex is repeated.
                xpoints = np.array(
                    [query_from, query_to, hit_to, hit_from, query_from]
                )
                ypoints = np.array([query_y, query_y, hit_y, hit_y, query_y])
            else:
                curved = RectangleCurveHeight(
                    x_coordinates=[query_from, query_to, hit_to, hit_from],
                    y_coordinates=[query_y, query_y, hit_y, hit_y],
                    proportions=[0, 0.2, 0.8, 1],
                )
                xpoints, ypoints = curved.coordinates_rectangle_height_bezier()
            homology = region["homology"]
            # Pick the sampling strategy that matches the colorscale setup.
            if set_colorscale_to_extreme_homologies:
                color = sample_colorscale_setting_lowest_and_highest_homologies(
                    truncated_colorscale=colorscale,
                    homology_value=homology,
                    lowest_homology=lowest_homology,
                    highest_homology=highest_homology,
                )
            else:
                color = sample_from_truncated_colorscale(
                    truncated_colorscale=colorscale,
                    homology_value=homology,
                )
            plot_polygon(
                fig,
                xpoints,
                ypoints,
                color=color,
                name=f"Identity: {homology*100:.2f}%",
                customdata=["identity", homology, homology_length],
                visible=visible,
            )
        row_y -= y_separation
    return fig
[docs]
def annotate_dna_sequences(
    fig: Figure,
    gb_records: DataFrame,
    longest_sequence: int,
    number_gb_records: int,
    annotate_with: str = "accession",
    y_separation: int = 10,
    padding: int = 10,
) -> Figure:
    """
    Add text annotations to DNA sequence lines using metadata from GenBank records.

    One label per row of `gb_records` is placed to the right of the longest
    sequence, starting at the top row and moving down by `y_separation` for
    each record. Every annotation is named "Sequence annotation: <label>".

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which annotations will be added.
    gb_records : pandas.DataFrame
        A DataFrame containing metadata for each GenBank sequence. It must
        include the column selected by `annotate_with`.
    longest_sequence : int
        Length of the longest sequence, used to offset the annotation on the x-axis.
    number_gb_records : int
        Total number of sequences (used to calculate vertical placement).
    annotate_with : str, default="accession"
        Column used for the label. Must be one of "accession", "record_name",
        "file_name", or "custom_name"; any other value falls back to
        "accession".
    y_separation : int, default=10
        Vertical spacing between each sequence line.
    padding : int, default=10
        Horizontal space between the end of the longest sequence and the annotation.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same figure, with sequence annotations added.
    """
    supported_columns = ("accession", "record_name", "file_name", "custom_name")
    column = annotate_with if annotate_with in supported_columns else "accession"
    x = longest_sequence + padding
    y = y_separation * number_gb_records
    for _, row in gb_records.iterrows():
        name = row[column]
        fig.add_annotation(
            x=x,
            xref="x",
            y=y,
            name=f"Sequence annotation: {name}",
            text=name,
            font=dict(size=18),
            showarrow=False,
            xanchor="left",
            yanchor="middle",
        )
        y -= y_separation
    return fig
[docs]
def annotate_top_genes(
    fig: Figure,
    annotate_genes_with: str,
    number_gb_records: int,
    cds_records: DataFrame,
    y_separation: int = 10,
) -> Figure:
    """
    Label the genes of the top sequence with vertical text above the row.

    Only rows with ``file_number == 0`` are annotated. Each label is centered
    on its gene, rotated by 270 degrees, and anchored by its bottom edge
    1.1 units above the top row.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which annotations will be added.
    annotate_genes_with : str
        Column of `cds_records` used as label text (e.g., "gene", "product").
    number_gb_records : int
        Total number of GenBank records; the top row sits at
        ``y_separation * number_gb_records``.
    cds_records : pandas.DataFrame
        CDS metadata. The columns read here are 'file_number', 'start_plot',
        'end_plot', and the column named by `annotate_genes_with`.
    y_separation : int, default=10
        Vertical spacing between sequence rows.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same figure, with gene annotations added above the top sequence.
    """
    is_top_file = cds_records["file_number"] == 0
    top_row_y = y_separation * number_gb_records
    for _, gene in cds_records.loc[is_top_file].iterrows():
        label = gene[annotate_genes_with]
        fig.add_annotation(
            # Center the label over the middle of the gene.
            x=(gene["start_plot"] + gene["end_plot"]) / 2,
            y=top_row_y + 1.1,
            text=label,
            name=f"Gene annotation: {label}",
            showarrow=False,
            textangle=270,
            font={"size": 16},
            xanchor="center",
            yanchor="bottom",
        )
    return fig
[docs]
def annotate_all_genes_above(
    fig: Figure,
    annotate_genes_with: str,
    number_gb_records: int,
    cds_records: DataFrame,
    y_separation: int = 10,
) -> Figure:
    """
    Label the genes of every sequence with vertical text above their row.

    Files are visited in order ``0 .. number_gb_records - 1`` from the top row
    down. Each label is centered on its gene, rotated by 270 degrees, and
    anchored by its bottom edge 1.1 units above the row of its file.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which annotations will be added.
    annotate_genes_with : str
        Column of `cds_records` used as label text (e.g., "gene", "product").
    number_gb_records : int
        Total number of GenBank records; the top row sits at
        ``y_separation * number_gb_records``.
    cds_records : pandas.DataFrame
        CDS metadata. The columns read here are 'file_number', 'start_plot',
        'end_plot', and the column named by `annotate_genes_with`.
    y_separation : int, default=10
        Vertical spacing between sequence rows.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same figure, with annotations added above the genes of every row.
    """
    row_y = y_separation * number_gb_records
    for file_number in range(number_gb_records):
        genes_in_file = cds_records.loc[cds_records["file_number"] == file_number]
        for _, gene in genes_in_file.iterrows():
            label = gene[annotate_genes_with]
            fig.add_annotation(
                # Center the label over the middle of the gene.
                x=(gene["start_plot"] + gene["end_plot"]) / 2,
                y=row_y + 1.1,
                text=label,
                name=f"Gene annotation: {label}",
                showarrow=False,
                textangle=270,
                font={"size": 16},
                xanchor="center",
                yanchor="bottom",
            )
        # The next file is drawn one row lower.
        row_y -= y_separation
    return fig
[docs]
def annotate_all_genes_below(
    fig: Figure,
    annotate_genes_with: str,
    number_gb_records: int,
    cds_records: DataFrame,
    y_separation: int = 10,
) -> Figure:
    """
    Label the genes of every sequence with vertical text below their row.

    Files are visited in order ``0 .. number_gb_records - 1`` from the top row
    down. Each label is centered on its gene, rotated by 270 degrees, and
    anchored by its top edge 1.1 units below the row of its file.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which annotations will be added.
    annotate_genes_with : str
        Column of `cds_records` used as label text (e.g., "gene", "product").
    number_gb_records : int
        Total number of GenBank records; the top row sits at
        ``y_separation * number_gb_records``.
    cds_records : pandas.DataFrame
        CDS metadata. The columns read here are 'file_number', 'start_plot',
        'end_plot', and the column named by `annotate_genes_with`.
    y_separation : int, default=10
        Vertical spacing between sequence rows.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same figure, with annotations added below the genes of every row.
    """
    row_y = y_separation * number_gb_records
    for file_number in range(number_gb_records):
        genes_in_file = cds_records.loc[cds_records["file_number"] == file_number]
        for _, gene in genes_in_file.iterrows():
            label = gene[annotate_genes_with]
            fig.add_annotation(
                # Center the label under the middle of the gene.
                x=(gene["start_plot"] + gene["end_plot"]) / 2,
                y=row_y - 1.1,
                text=label,
                name=f"Gene annotation: {label}",
                showarrow=False,
                textangle=270,
                font={"size": 16},
                xanchor="center",
                yanchor="top",
            )
        # The next file is drawn one row lower.
        row_y -= y_separation
    return fig
[docs]
def annotate_bottom_genes(
    fig: Figure,
    annotate_genes_with: str,
    number_gb_records: int,
    cds_records: DataFrame,
    y_separation: int = 10,
) -> Figure:
    """
    Label the genes of the bottom sequence with vertical text below the row.

    Only rows with ``file_number == number_gb_records - 1`` are annotated.
    Each label is centered on its gene, rotated by 270 degrees, and anchored
    by its top edge 1.1 units below the bottom row, which sits at
    `y_separation`.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which annotations will be added.
    annotate_genes_with : str
        Column of `cds_records` used as label text (e.g., "gene" or "product").
    number_gb_records : int
        Total number of GenBank records, used to identify the bottom sequence.
    cds_records : pandas.DataFrame
        CDS metadata. The columns read here are 'file_number', 'start_plot',
        'end_plot', and the column named by `annotate_genes_with`.
    y_separation : int, default=10
        Vertical spacing between sequence rows; also the y position of the
        bottom row.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same figure, with gene annotations added below the bottom sequence.
    """
    last_file_number = number_gb_records - 1
    bottom_genes = cds_records.loc[cds_records["file_number"] == last_file_number]
    bottom_row_y = y_separation
    for _, gene in bottom_genes.iterrows():
        label = gene[annotate_genes_with]
        fig.add_annotation(
            # Center the label under the middle of the gene.
            x=(gene["start_plot"] + gene["end_plot"]) / 2,
            y=bottom_row_y - 1.1,
            text=label,
            name=f"Gene annotation: {label}",
            showarrow=False,
            textangle=270,
            font={"size": 16},
            xanchor="center",
            yanchor="top",
        )
    return fig
[docs]
def annotate_genes(fig: Figure, plot_parameters: PlotParameters) -> Figure:
    """
    Apply the gene annotation mode stored in `plot_parameters`.

    The value of ``plot_parameters.annotate_genes`` selects which helper(s)
    run:

    - "top"        -> `annotate_top_genes`
    - "bottom"     -> `annotate_bottom_genes`
    - "top-bottom" -> `annotate_top_genes`, then `annotate_bottom_genes`
    - "all-above"  -> `annotate_all_genes_above`
    - "all-below"  -> `annotate_all_genes_below`

    Any other value leaves the figure untouched.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to which gene annotations will be added.
    plot_parameters : PlotParameters
        Configuration object. The attributes read here are `annotate_genes`,
        `annotate_genes_with`, `number_gb_records`, `cds_df`, and
        `y_separation`.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The figure returned by the last helper that ran, or the input figure
        when no mode matched.
    """
    mode = plot_parameters.annotate_genes

    def run(annotator, figure: Figure) -> Figure:
        # Every helper takes the same keyword arguments.
        return annotator(
            fig=figure,
            annotate_genes_with=plot_parameters.annotate_genes_with,
            number_gb_records=plot_parameters.number_gb_records,
            cds_records=plot_parameters.cds_df,
            y_separation=plot_parameters.y_separation,
        )

    if mode in ("top", "top-bottom"):
        fig = run(annotate_top_genes, fig)
    if mode in ("bottom", "top-bottom"):
        fig = run(annotate_bottom_genes, fig)
    if mode == "all-above":
        fig = run(annotate_all_genes_above, fig)
    if mode == "all-below":
        fig = run(annotate_all_genes_below, fig)
    return fig
[docs]
def remove_traces_by_name(figure: dict, name: str) -> dict:
    """
    Remove every trace whose 'name' contains a given substring.

    The figure dictionary is modified in place: its ``"data"`` list is replaced by a
    new list holding only the traces that do not match. Traces without a ``"name"``
    key, or whose name is not a string (e.g. ``None``), never match and are kept.

    Parameters
    ----------
    figure : dict
        A dictionary representing a Plotly figure (as returned by dcc.Graph). It
        must contain a ``"data"`` entry with the list of traces.
    name : str
        A substring to match within the 'name' field of each trace. Any trace whose
        name contains this string will be removed.

    Returns
    -------
    dict
        The same figure dictionary, with the matching traces removed.
    """

    def _name_matches(trace) -> bool:
        # A trace can only match when it carries a string name; a missing or
        # None name would otherwise make the substring test raise TypeError.
        if "name" not in trace:
            return False
        trace_name = trace["name"]
        return isinstance(trace_name, str) and name in trace_name

    figure["data"] = [trace for trace in figure["data"] if not _name_matches(trace)]
    return figure
[docs]
def hide_homology(figure: Figure, min_homology: int) -> Figure:
    """
    Toggle visibility of homology traces based on their length.

    Each trace in ``figure["data"]`` is inspected. Traces whose `customdata` is
    ``None`` or does not contain the marker ``"identity"`` are left untouched. For
    homology traces, `visible` is set to False when the homology length is less than
    or equal to `min_homology`, and to True otherwise.

    The expected structure of `customdata` is:
        [ "identity", identity_value: float, homology_length: int ]

    Parameters
    ----------
    figure : plotly.graph_objects.Figure
        A Plotly figure object containing homology region traces with `customdata`.
    min_homology : int
        Homology length threshold. Only regions strictly longer than this value
        remain visible.

    Returns
    -------
    figure : plotly.graph_objects.Figure
        The same figure, with trace visibility toggled based on homology length.
    """
    for trace in figure["data"]:
        customdata = trace["customdata"]
        # Only homology regions are tagged with the "identity" marker.
        if customdata is None or "identity" not in customdata:
            continue
        # The third customdata entry holds the homology length.
        homology_length = int(customdata[2])
        trace.visible = homology_length > min_homology
    return figure
[docs]
def remove_annotations_by_name(figure: Figure, name: str) -> Figure:
    """
    Remove annotations from a Plotly figure whose 'name' contains the given substring.

    This function filters `figure.layout['annotations']` and drops every annotation
    whose name contains `name`. Annotations may be plain dictionaries or annotation
    objects exposing a `name` attribute. Annotations without a name (missing or
    ``None``) never match and are always kept.

    Parameters
    ----------
    figure : plotly.graph_objects.Figure
        A Plotly figure object that may contain annotations.
    name : str
        A substring to search for in the 'name' field of each annotation.
        Annotations matching this string will be removed.

    Returns
    -------
    figure : plotly.graph_objects.Figure
        The same figure, with matching annotations removed.
    """
    kept = []
    for annotation in figure.layout["annotations"]:
        # Read the name without assuming the container type: dict annotations may
        # lack the key, and annotation objects return None when no name was set.
        if isinstance(annotation, dict):
            annotation_name = annotation.get("name")
        else:
            annotation_name = getattr(annotation, "name", None)
        if isinstance(annotation_name, str) and name in annotation_name:
            continue
        kept.append(annotation)
    # The annotations are written back as a tuple, as the original code did.
    figure.layout["annotations"] = tuple(kept)
    return figure
[docs]
def make_selection_effect(figure: Figure, curve_number: int) -> Figure:
    """
    Visually highlight a selected trace by changing its line color and thickness.

    The current line color of the trace at `curve_number` is read and converted to
    normalized RGB channels. A weighted sum of the channels gives its lightness:
    dark colors (lightness below 0.3) get a light outline, all others get a dark
    outline. The line width is set to 6.

    Supported color formats are hex strings (containing "#") and ``rgb(r, g, b)`` or
    ``rgba(r, g, b, a)`` strings with 0-255 channels. The alpha component of an
    ``rgba`` color is ignored.

    Parameters
    ----------
    figure : plotly.graph_objects.Figure
        The Plotly figure containing the trace to modify.
    curve_number : int
        The index of the trace to update (corresponds to 'curveNumber' from Dash
        clickData).

    Returns
    -------
    figure : plotly.graph_objects.Figure
        The same figure, with the specified trace visually emphasized.

    Raises
    ------
    ValueError
        If the color is neither a hex string nor a parseable ``rgb``/``rgba`` string.
    """
    default_black = "rgb(30, 30, 30)"
    default_light = "rgb(230, 230, 230)"
    line = figure["data"][curve_number]["line"]
    color_curve = line["color"]
    if "#" in color_curve:
        # matplotlib already returns channels in the 0-1 range.
        red, green, blue = mcolors.to_rgb(color_curve)
    else:
        # Keep what lies between the parentheses, e.g. "30, 30, 30" or
        # "0, 0, 0, 0.5", and use the first three channels only.
        inner = color_curve.partition("(")[2].strip().rstrip(")")
        red, green, blue = [float(n) / 255 for n in inner.split(",")[:3]]
    # Weighted channel sum; green contributes the most to perceived lightness.
    lightness = 0.2126 * red + 0.7152 * green + 0.0722 * blue
    # A dark trace needs a light outline to stand out; anything else gets a dark one.
    line["color"] = default_light if lightness < 0.3 else default_black
    line["width"] = 6
    return figure
[docs]
def change_homology_color(
    figure: dict,
    colorscale_name: str,
    vmin_truncate: float,
    vmax_truncate: float,
    set_colorscale_to_extreme_homologies: bool = False,
    lowest_homology: None | float = None,
    highest_homology: None | float = None,
) -> dict:
    """
    Recolor homology region traces according to their identity and a colorscale.

    A truncated colorscale is built once from `colorscale_name`, `vmin_truncate` and
    `vmax_truncate`. Every trace in ``figure["data"]`` that carries homology metadata
    in `customdata` then gets its `fillcolor` and `line.color` replaced with the
    color sampled for its identity value. Traces without `customdata`, with
    `customdata` set to None, or without the "identity" marker are left unchanged.

    Expected structure of `customdata` for homology regions:
        [ "identity", identity_value: float, homology_length: int ]

    Parameters
    ----------
    figure : dict
        A dictionary representation of a Plotly figure (typically from Dash state).
    colorscale_name : str
        Name of the Plotly colorscale to use (e.g., "Greys", "Viridis").
    vmin_truncate : float
        Lower bound (0-1) of the normalized identity range for color mapping.
    vmax_truncate : float
        Upper bound (0-1) of the normalized identity range for color mapping.
    set_colorscale_to_extreme_homologies : bool, default=False
        If True, sample colors with
        `sample_colorscale_setting_lowest_and_highest_homologies`, which receives
        the dataset-wide lowest and highest identities. Otherwise sample with
        `sample_from_truncated_colorscale`.
    lowest_homology : float or None
        The minimum identity value in the dataset, forwarded to the sampler when
        using extreme homology scaling.
    highest_homology : float or None
        The maximum identity value in the dataset, forwarded to the sampler when
        using extreme homology scaling.

    Returns
    -------
    figure : dict
        The same figure dictionary, with modified homology trace colors.
    """
    truncated = get_truncated_colorscale(
        colorscale_name=colorscale_name,
        vmin=vmin_truncate,
        vmax=vmax_truncate,
    )
    for trace in figure["data"]:
        # Skip anything that carries no homology metadata.
        if "customdata" not in trace or trace["customdata"] is None:
            continue
        if "identity" not in trace["customdata"]:
            continue
        # The identity value is the second customdata entry.
        identity_value = float(trace["customdata"][1])
        if not set_colorscale_to_extreme_homologies:
            new_color = sample_from_truncated_colorscale(
                truncated_colorscale=truncated, homology_value=identity_value
            )
        else:
            new_color = sample_colorscale_setting_lowest_and_highest_homologies(
                truncated_colorscale=truncated,
                homology_value=identity_value,
                lowest_homology=lowest_homology,
                highest_homology=highest_homology,
            )
        trace["fillcolor"] = new_color
        trace["line"]["color"] = new_color
    return figure
[docs]
def plot_scale(
    figure: Figure, length_longest_sequence: int, add_scale: bool = True
) -> Figure:
    """
    Add a scale bar, in base pairs (bp), below the plot.

    The scale length is one-fifth of the longest sequence, passed through
    `misc.round_up_to_nearest_significant_digit`. A horizontal line shape named
    "Scale bar" and a text annotation named "Scale annotation" are added at
    y = -0.15 in paper coordinates. When `add_scale` is False both elements are
    still added, but drawn with a fully transparent color.

    Parameters
    ----------
    figure : plotly.graph_objects.Figure
        The Plotly figure to which the scale bar will be added.
    length_longest_sequence : int
        Length of the longest DNA sequence in the plot (used to size the scale).
    add_scale : bool, default=True
        Whether to display the scale. If False, a fully transparent line and label
        are added.

    Returns
    -------
    figure : plotly.graph_objects.Figure
        The same figure, with the scale line and annotation added.
    """
    one_fifth = length_longest_sequence / 5
    scale_length: int = misc.round_up_to_nearest_significant_digit(one_fifth)
    if add_scale:
        color: str = "rgba(0, 0, 0, 1)"
    else:
        color = "rgba(0,0,0,0)"
    # Vertical position shared by the bar and its label (paper coordinates).
    bar_y = -0.15

    # Line representing the scale.
    # NOTE(review): Plotly documents shape xanchor/yanchor as relevant only with
    # pixel size modes; they are passed through unchanged here.
    bar_style = dict(color=color, width=4)
    figure.add_shape(
        type="line",
        x0=0,
        x1=scale_length,
        y0=bar_y,
        y1=bar_y,
        xref="x",
        yref="paper",
        xanchor="center",
        yanchor="top",
        line=bar_style,
        layer="above",
        name="Scale bar",
    )

    # Label centered under the bar.
    label_font = dict(size=18, color=color)
    figure.add_annotation(
        x=scale_length / 2,
        y=bar_y,
        xref="x",
        yref="paper",
        xanchor="center",
        yanchor="top",
        text=f"{scale_length:,.0f} bp",
        name="Scale annotation",
        showarrow=False,
        font=label_font,
        hovertext=None,
        hoverlabel=None,
    )
    return figure
[docs]
def toggle_scale_bar(figure: Figure, show: bool) -> Figure:
    """
    Show or hide the scale bar by switching the alpha channel of its color.

    The layout shapes are searched for the element named "Scale bar" and the layout
    annotations for the element named "Scale annotation". The shape's line color and
    the annotation's font color are set to opaque black when `show` is True and to
    fully transparent black otherwise. No element is added or removed.

    Parameters
    ----------
    figure : plotly.graph_objects.Figure
        The Plotly figure containing the scale bar shape and annotation.
    show : bool
        If True, make the scale bar visible. If False, hide it using transparent
        colors.

    Returns
    -------
    figure : plotly.graph_objects.Figure
        The same figure, with the scale bar toggled on or off.
    """
    if show:
        color: str = "rgba(0, 0, 0, 1)"
    else:
        color = "rgba(0,0,0,0)"
    # (layout collection, element name, attribute that carries the color)
    targets = (
        ("shapes", "Scale bar", "line"),
        ("annotations", "Scale annotation", "font"),
    )
    for collection, element_name, style_attribute in targets:
        for element in getattr(figure.layout, collection) or []:
            if getattr(element, "name", "") != element_name:
                continue
            if not hasattr(element, style_attribute):
                continue
            getattr(element, style_attribute).color = color
    return figure
[docs]
def add_or_remove_title(
    fig: Figure,
    title: str,
) -> Figure:
    """
    Update the Plotly figure by adding or removing its title accordingly.

    If the provided title is None, empty, or contains only whitespace, the figure
    title is removed. Otherwise the title is set, horizontally centered, with a font
    size of 30.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        The Plotly figure to modify.
    title : str
        The figure's title. If it is None, empty, or contains only whitespace, no
        title is added, which helps to remove the title during Dash callbacks.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The same Plotly figure, with its title updated.
    """
    # None is treated like an empty title instead of failing on `.strip()`.
    if title is None or not title.strip():
        fig.update_layout(title=None)
        return fig
    fig.update_layout(
        title=dict(
            text=title,
            x=0.5,
            xanchor="center",
            font=dict(size=30),
        )
    )
    return fig
[docs]
def make_alignments(
    input_files: list[Path], output_folder: Path
) -> tuple[DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Run BLASTn between the input sequences and collect the metadata used for plotting.

    The GenBank files are converted to FASTA files, BLASTn is run locally on them,
    and the BLAST output is parsed into DataFrames. GenBank record and CDS metadata
    are extracted from the input files as well. The intermediate FASTA files and
    BLAST result files are deleted before returning.

    Parameters
    ----------
    input_files : list of pathlib.Path
        List of file paths pointing to input GenBank files.
    output_folder : pathlib.Path
        Directory where temporary BLAST-related files will be written.

    Returns
    -------
    gb_df : pandas.DataFrame
        DataFrame with GenBank record metadata, including:
        - file path, file name, sequence length, file number, and accession number.
        Used to relate to CDS data (`cds_df`) by file number or accession.
    cds_df : pandas.DataFrame
        DataFrame with coding sequence (CDS) metadata for each GenBank file,
        including:
        - gene name, product name, strand, start/end coordinates (both raw and for
          plotting), file number, and accession number.
    alignments_df : pandas.DataFrame
        DataFrame summarizing BLASTn results per alignment, including:
        - alignment number, query name, hit name, query length, and hit length.
    regions_df : pandas.DataFrame
        DataFrame describing individual regions of sequence similarity (homology)
        between aligned sequences. Includes:
        - alignment number (to relate to `alignments_df`) and region-level
          coordinates.
        See `parse_blast_record` in the `gb_files_manipulation` module for details.
    """
    # FASTA files derived from the GenBank inputs are what BLASTn consumes.
    fasta_paths = genbank.make_fasta_files(input_files, output_folder)
    # Local BLASTn run; the results are handed to the parser below.
    blast_outputs = genbank.run_blastn(fasta_paths, output_folder)
    # Alignment-level and region-level tables from the BLAST results.
    alignments_df, regions_df = genbank.get_blast_metadata(blast_outputs)
    # Record-level and CDS-level tables from the GenBank files themselves.
    gb_df, cds_df = genbank.genbank_files_metadata_to_dataframes(input_files)
    # Delete the intermediate files used for generating the BLASTn results.
    for temporary_files in (fasta_paths, blast_outputs):
        misc.delete_files(temporary_files)
    return gb_df, cds_df, alignments_df, regions_df