from mdvtools.mdvproject import MDVProject
from typing import Optional
import json
[docs]
def create_error_markdown(message: str, traceback: Optional[str] = None, extra_metadata: Optional[dict] = None) -> str:
    """
    Format an error as a markdown snippet.

    The message is markdown-escaped and shown after a bold "Error:" label.
    When given, the traceback and the extra metadata are each appended inside
    their own collapsible HTML ``<details>`` section as a fenced code block.

    Args:
        message: Error text; passed through ``escape_markdown``.
        traceback: Optional traceback text, inserted verbatim in a code fence.
        extra_metadata: Optional dict rendered as indented JSON.

    Returns:
        The assembled markdown string.
    """
    parts = [f"**Error:** {escape_markdown(message)}\n\n"]
    if traceback:
        # Use HTML details tag for collapsable section
        parts.append(f"<details><summary>Traceback</summary>\n\n```\n{traceback}\n```\n\n</details>\n\n")
    if extra_metadata:
        # default=str is deliberate: this runs while reporting an error, so a
        # value json cannot encode natively is shown via str() rather than
        # raising TypeError and hiding the error being reported.
        metadata_json = json.dumps(extra_metadata, indent=2, default=str)
        parts.append(f"<details><summary>Extra Metadata</summary>\n\n```json\n{metadata_json}\n```\n\n</details>\n\n")
    return "".join(parts)
[docs]
def create_project_markdown(project: MDVProject) -> str:
    """
    Summarise every datasource of ``project`` as collapsible markdown.

    The result is a single HTML ``<details>`` section titled
    "Project Datasources". For each name from
    ``project.get_datasource_names()`` it contains a heading with the
    datasource name and its ``size`` (labelled as rows), followed by the
    column tables produced by ``create_column_markdown``.
    """
    sections = [
        "\n\n<details>\n\n",
        "<summary>Project Datasources</summary>\n\n",
    ]
    for ds_name in project.get_datasource_names():
        metadata = project.get_datasource_metadata(ds_name)
        sections.append(f"## **{ds_name}:** ({metadata['size']} rows)\n\n")
        # todo - add a summary of the data here
        sections.append(create_column_markdown(metadata["columns"]))
    sections.append("\n\n</details>\n\n")
    return "".join(sections)
[docs]
def escape_markdown(text: str) -> str:
    """
    Return ``text`` with markdown special characters backslash-escaped.

    Each of the characters ``\\ ` * _ { } [ ] ( ) # + - . ! < > |`` is
    prefixed with a backslash; every other character is left unchanged.
    Should richer markdown handling be needed later, a dedicated markdown
    library may be the better tool.
    """
    # One translation pass: every special character maps to "\" + itself.
    escape_table = str.maketrans({ch: "\\" + ch for ch in "\\`*_{}[]()#+-.!<>|"})
    return text.translate(escape_table)
[docs]
def create_column_markdown(cols: list[dict]) -> str:
    """
    Render column metadata as markdown tables grouped by datatype.

    Columns are split on their ``'datatype'`` entry into categorical
    (``text``, ``text16``, ``multitext``, ``unique``), numeric (``integer``,
    ``double``, ``int32``) and "other" (anything else, including a missing
    datatype). Each non-empty group gets its own table, in that order:

    * categorical: name, datatype and up to five sample ``'values'``
      (followed by ``...`` when there are more);
    * numeric: name, datatype, ``'minMax'`` and the ``'0.05'`` entry of
      ``'quantiles'``;
    * other: name and datatype.

    Names and sample values are passed through ``escape_markdown``. Optional
    numeric metadata is read defensively: a ``'minMax'`` that is not a pair,
    a ``'quantiles'`` entry that is not a dict, or a ``'0.05'`` quantile that
    is not a sequence of at least two items yields an empty cell instead of
    raising.

    Args:
        cols: Column metadata dicts.

    Returns:
        The markdown tables followed by a blank line, or
        ``"No columns found."`` followed by a blank line when ``cols``
        yields no columns.
    """
    numeric_types = ('integer', 'double', 'int32')
    categorical_types = ('text', 'text16', 'multitext', 'unique')

    numeric_cols = []
    categorical_cols = []
    other_cols = []
    for c in cols:
        dt = c.get('datatype')
        if dt in numeric_types:
            numeric_cols.append(c)
        elif dt in categorical_types:
            categorical_cols.append(c)
        else:
            other_cols.append(c)

    if not (numeric_cols or categorical_cols or other_cols):
        return "No columns found.\n\n"

    parts = []

    if categorical_cols:
        parts.append("\n### Categorical Columns\n")
        parts.append("| Column Name | Data Type | Values (sample) |\n|---|---|---|\n")
        for col in categorical_cols:
            col_name = escape_markdown(col.get('name', ''))
            datatype = col.get('datatype', '')
            values = col.get('values') or []
            values_str = ", ".join(escape_markdown(str(v)) for v in values[:5])
            if len(values) > 5:
                values_str += ", ..."
            parts.append(f"| {col_name} | {datatype} | {values_str} |\n")

    if numeric_cols:
        parts.append("\n### Numeric Columns\n")
        parts.append("| Column Name | Data Type | Min / Max | Quantiles (0.05) |\n|---|---|---|---|\n")
        for col in numeric_cols:
            col_name = escape_markdown(col.get('name', ''))
            datatype = col.get('datatype', '')
            min_max = col.get('minMax')
            min_max_str = f"{min_max[0]} / {min_max[1]}" if min_max and len(min_max) == 2 else ""
            # 'quantiles' may be absent or hold something other than a dict;
            # only look up '0.05' when it really is a mapping.
            quantiles = col.get('quantiles')
            q05 = quantiles.get('0.05') if isinstance(quantiles, dict) else None
            # Guard the pair the same way minMax is guarded, so a short or
            # non-sequence entry produces an empty cell rather than an error.
            has_pair = isinstance(q05, (list, tuple)) and len(q05) >= 2
            quantiles_str = f"[{q05[0]}, {q05[1]}]" if has_pair else ""
            parts.append(f"| {col_name} | {datatype} | {min_max_str} | {quantiles_str} |\n")

    if other_cols:
        parts.append("\n### Other Columns\n")
        parts.append("| Column Name | Data Type |\n|---|---|\n")
        for col in other_cols:
            col_name = escape_markdown(col.get('name', ''))
            datatype = col.get('datatype', 'N/A')
            parts.append(f"| {col_name} | {datatype} |\n")

    return "".join(parts) + "\n\n"
[docs]
# Chart type names offered to the model, in display order. They are rendered
# below as one markdown bullet per name.
_CHART_TYPE_NAMES = (
    "Abundance Box Plot",
    "Box Plot",
    "Density Scatter Plot",
    "Dot Plot",
    "Heat Map",
    "Histogram Plot",
    "Multi Line Chart",
    "Pie Chart",
    "Row Chart",
    "Row Summary Box",
    "Sankey Diagram",
    "Stacked Row Chart (Categorical Heatmap)",
    "Table",
    "Text Box",
    "2D Scatter Plot",
    "Violin Plot",
    "Word Cloud",
)

# Markdown bullet list of the chart types: a leading newline, then one
# "- <name>" line per chart.
chart_types_md = "\n" + "".join(f"- {chart_name}\n" for chart_name in _CHART_TYPE_NAMES)
[docs]
# Markdown bullet list mapping question keywords ("intents") to the chart
# types that suit them. Chart names here should match the entries in
# chart_types_md exactly, because the prompt tells the model to use only
# charts from that list.
# NOTE(review): "Line Chart" and "Selection Dialog Plot" have no matching
# entry in chart_types_md - confirm whether they should be added there or
# dropped from these examples.
example_intents_md = """
- "distribution", "spread" = Histogram Plot, Box Plot, Violin Plot
- "relationship", "correlation" = 2D Scatter Plot, Density Scatter Plot, Heat Map
- "comparison", "difference", "change" = Box Plot, Violin Plot, Multi Line Chart, Dot Plot
- "composition", "proportion", "breakdown" = Pie Chart, Stacked Row Chart, Row Chart
- "over time", "trend", "temporal" = Line Chart, Multi Line Chart
- "expression", "gene", "marker" = Dot Plot, Heat Map, Box Plot
- "spatial", "location", "embedding" = 2D Scatter Plot, Density Scatter Plot
- "flow", "transition" = Sankey Diagram
- "metadata", "category", "annotation" = Table, Row Summary Box, Row Chart
- "filter", "subset", "select" = Selection Dialog Plot
"""
[docs]
def create_suggested_questions_prompt(project: MDVProject) -> str:
    """
    Build the prompt asking for five suggested questions about ``project``.

    The prompt embeds the datasource summary from
    ``create_project_markdown(project)``, the allowed chart list
    (``chart_types_md``) and the intent examples (``example_intents_md``),
    followed by instructions on how the questions should be phrased.
    """
    datasources_md = create_project_markdown(project)
    prompt = f"""
You are an expert in the data in this project.
You are given a project with the following datasources:
{datasources_md}
Given this data, generate a list of 5 biologically relevant questions we could visualise.
When phrasing the questions, only refer to column names found in that project data, and only suggest visualisations using these charts:
{chart_types_md}
## Example intents:
{example_intents_md}
If a query needs multiple charts to be plotted, show them as individual queries.
For example: What is the distribution of age_or_mean_of_age_range or BMI among different disease categories?
Chart: Histogram Plot, Box Plot, Violin Plot should be What is the distribution of age_or_mean_of_age_range
among different disease categories? Violin Plot to keep the visualisation simple.
Each question should be a single, complete sentence.
"""
    return prompt