# Source code for templates

from mdvtools.mdvproject import MDVProject
from typing import Any

# [docs]
# Instruction text for the column-selection step: it tells the model to pick
# columns from df1 (cells) and gene names from df2["name"], to respect the
# per-chart column requirements listed below, and to answer with exactly two
# lines -- one starting with "fields", one starting with "charts".
prompt_data: str = """
Your task is to:  
1. Identify the type of data the user needs (e.g., categorical, numerical, etc.) by inspecting the DataFrames provided.
2. Use only the two DataFrames provided:
   - df1: cells (data_frame_obs)
   - df2: genes (data_frame_var)
3. Column selection logic:
   - For non-gene queries: select columns from df1 only. Inspect df1, using df1.columns
   - For gene-related queries (e.g., expression of a gene, comparison of genes, highest expressing genes):
       a. Use ONLY gene names from df2["name"] — do NOT use gene IDs or any other columns (e.g., df2["gene_ids"]).
       b. If a specific gene is mentioned by the user, check if it exists in df2["name"].
           - If it does not exist, assume the user provided a gene name and that df2["name"] may contain gene IDs instead.
                - Attempt to match the user-provided gene name to the corresponding gene ID using any available mapping logic (e.g., a lookup function or mapping dictionary).
                - If a corresponding gene ID is found in df2["name"], return that value.
           - If it exists, return it.
           - If no match is found, ignore the requested gene and instead select one or more gene names from df2["name"].
       c. If no gene is mentioned, select one or more gene names from df2["name"].
       d. Only use values from df2["name"] — do NOT use any other columns from df2.
4. Always return the list of required columns as a quoted comma-separated string, like:
   - "col1", "col2"
   - Or for gene-related: "col", "gene_name"   (make sure "col" is from df1) 
5. For gene-related queries:
   - Return both df1 columns and the selected gene name (from df2["name"]).
   - Only return the name as a string (e.g., "gene_name")—do not wrap it.
6. NEVER create new DataFrames or modify existing ones.
7. Ensure that the selected columns match the visualization requirements:  
    - Abundance Box plot: Requires three categorical columns.  
      - If only one categorical variable is available, return it three times.  
      - If two are available, return one of them twice.  
    - Box plot: Requires only one categorical column and one numerical column.  
    - Density Scatter plot: Requires two numerical columns and one categorical column.  
    - Dot plot: Requires only one categorical column and any number of numerical columns.  
    - Heatmap: Requires only one categorical column and any number of numerical columns.  
    - Histogram: Requires one numerical column.  
    - Multiline chart: Requires one numerical column and one categorical column.  
    - Pie Chart: Requires one categorical column.  
    - Row Chart: Requires one categorical column.  
    - Row summary box: Requires any column(s).  
    - Sankey plot: Requires two categorical columns.  
      - If only one categorical variable is available, return it twice.  
    - Scatter plot (2D): Requires two numerical columns and one any column for color.
    - Scatter plot (3D): Requires three numerical columns and one any column for color.  
    - Selection dialog plot: Requires any column.  
    - Stacked row chart: Requires two categorical columns.  
      - If only one categorical variable is available, return it twice.  
    - Table Plot: Requires any column(s).  
    - Text box: Requires no columns, just text.  
    - Violin plot: Requires only one categorical column and one numerical column.  
    - Wordcloud: Requires one categorical column.  
8. Important: Clearly separate the selected columns with quotes and commas.
9. The column names are case sensitive therefore return them as they are defined in the dataframe.
10. Output format:
   - First line: The word "fields" followed by quoted, comma-separated list of column names.
   - Second line: The word "charts" followed by quoted, comma-separated list of suitable chart types for the selected columns.
11. NEVER explain your reasoning.
"""



# [docs]
# (module, class) pairs under ``mdvtools.charts``, listed in the exact order
# in which their import lines appear in the generated preamble.
_CHART_IMPORTS = (
    ("density_scatter_plot", "DensityScatterPlot"),
    ("heatmap_plot", "HeatmapPlot"),
    ("histogram_plot", "HistogramPlot"),
    ("dot_plot", "DotPlot"),
    ("box_plot", "BoxPlot"),
    ("scatter_plot_3D", "ScatterPlot3D"),
    ("row_chart", "RowChart"),
    ("scatter_plot", "ScatterPlot"),
    ("abundance_box_plot", "AbundanceBoxPlot"),
    ("stacked_row_plot", "StackedRowChart"),
    ("ring_chart", "RingChart"),
    ("pie_chart", "PieChart"),
    ("violin_plot", "ViolinPlot"),
    ("multi_line_plot", "MultiLinePlot"),
    ("table_plot", "TablePlot"),
    ("wordcloud_plot", "WordcloudPlot"),
    ("text_box_plot", "TextBox"),
    ("row_summary_box_plot", "RowSummaryBox"),
    ("selection_dialog_plot", "SelectionDialogPlot"),
    ("sankey_plot", "SankeyPlot"),
)

# Import preamble, as text: general packages, the mdvtools entry points, one
# import line per chart class, then a blank line and the remaining modules.
packages_functions = "\n".join(
    [
        "import os",
        "import pandas as pd",
        "import scanpy as sc",
        "from mdvtools.mdvproject import MDVProject",
        "from mdvtools.conversions import convert_scanpy_to_mdv",
        *(
            f"from mdvtools.charts.{chart_module} import {chart_class}"
            for chart_module, chart_class in _CHART_IMPORTS
        ),
        "",
        "import json",
        "import numpy as np",
        "import sys",
        "",  # yields the trailing newline after the last import
    ]
)

# def load_data(path):
#     #Load data from the specified CSV file.
#     return pd.read_csv(path, low_memory=False)

# def convert_plot_to_json(plot):
#     #Convert plot data to JSON format.
#     return json.loads(json.dumps(plot.plot_data, indent=2).replace("\\\\", ""))




# [docs]
def get_createproject_prompt_RAG(project: MDVProject, path_to_data: str, datasource_name: str, final_answer: str, question: str) -> str:
    """
    Build the RAG prompt that guides LLM code generation for creating MDV plots.

    The caller's values are spliced into a fixed instruction text by plain
    string concatenation. The ``{context}`` placeholder and the brace runs
    around ``param`` / ``param_index`` in the gene-syntax example are returned
    verbatim.
    NOTE(review): presumably a later template-formatting step outside this
    function resolves them, in which case braces inside ``question`` or
    ``final_answer`` would be interpreted by that step too -- confirm against
    the caller.

    Args:
        project: Only ``project.dir`` is read; it is emitted as ``project_path``.
        path_to_data: Emitted as ``data_path``.
        datasource_name: Emitted as ``datasource_name``.
        final_answer: Text naming the selected fields and chart types; it is
            repeated in the plot-construction, parameter-handling and task
            sections.
        question: The user's question, inserted in the "Your Task" section.

    Returns:
        The assembled prompt text.
    """
    # str() keeps the concatenation below working when `dir` is a path object
    # rather than a plain string.
    project_dir = str(project.dir)
    # Put the question and the selected fields/charts on separate lines so the
    # end of the question does not run into the start of `final_answer`.
    task_input = question + "\n" + final_answer
    prompt_RAG = (
    """
    Context: {context}

    The provided scripts demonstrate how to generate various data visualizations using the `mdvtools` library in Python.

    Each script follows this standard workflow:

    1. Setup:
        - Initialize an MDVProject instance using the method: MDVProject(project_path, delete_existing=True).
        - Use `scanpy.read_h5ad(data_path)` to load the AnnData object.

    2. Data Loading:
        - Extract `adata.obs` into `data_frame_obs` (cell-level info).
        - Extract `adata.var` into `data_frame_var` (gene-level info).
        - Add a `name` column to `data_frame_var`: `adata.var_names.to_list()`

    3. Datasource Registration:
        - Add data to the MDV project using:
            ```python
            project.add_datasource(datasource_name, data_frame_obs)
            project.add_datasource(datasource_name_2, data_frame_var)
            ```

    4. Plot Construction:
        - Use a chart class (e.g., DotPlot, BoxPlot, SelectionDialogPlot) and set `params = [...]` using selected fields.
            - The fields and chart type are given by """+final_answer+"""
        - Convert the chart to JSON using `convert_plot_to_json(plot)`
        - Set the view using `project.set_view(view_name, view_object)`

    5. Parameter Handling:
        - The string """+final_answer+""" specifies the field names to use in the `params` list and the chart type to use.
        - For parameters from `data_frame_obs` (cell-level), use them as-is.
        - For gene expression values from `data_frame_var`, use this syntax:
            ```python
            param = "GENE_NAME"
            param_index = data_frame_var['name'].tolist().index(param)
            f"gs|{{{{param}}}}(gs)|{{{{param_index}}}}"
            ```

    6. Gene-Related Queries:
        If the question involves gene expression, expression comparison, or refers to gene names:
        - Load both `cells` and `genes` datasources.
        - Wrap gene names (from `data_frame_var`) using the syntax above.
        - Only wrap genes—do not apply `get_loc()` or `index` on `data_frame_obs` fields.

    7. Queries requiring subsetting of the dataset:
        If to answer the question requires a subset of the data or filtering the data, make sure to:
        - Add a selection dialog plot with all the parameters that were passed on as params. Make sure it has a title.

    8. Always add a selection dialog plot along the other charts. It must have all the parameters that were passed on as params. It must have a title.

    9. Your Task:
        - Interpret the user question and decide based on the question which graph needs to be plotted: """+task_input+"""
        - Use the fields in the """+final_answer+""" as params appropriately:
            - Wrap only gene names as shown.
            - Use others directly. Fields are case sensitive.
        - Use formatted f-strings for all dynamic strings.
        - Generate a valid Python script that creates and visualizes the appropriate chart using the MDVProject framework.
        - Update these variables with these values:
            - project_path = '"""+project_dir+"""'
            - data_path = '"""+path_to_data+"""'
            - view_name = a string, in double quotes, describing what is being visualized.
            - datasource_name = '"""+datasource_name+"""'
        - The possible charts are given by """+final_answer+""" and should follow the following visualisation guidelines for each type of chart:
            - Abundance Box plot: Requires three categorical columns.  
                - If only one categorical variable is available, use it three times.  
                - If two are available, use one of them twice.  
            - Box plot: Requires only one categorical column and one numerical column.  
            - Density Scatter plot: Requires two numerical columns and one categorical column.  
            - Dot plot: Requires only one categorical column and any number of numerical columns.  
            - Heatmap: Requires only one categorical column and any number of numerical columns.  
            - Histogram: Requires one numerical column.  
            - Multiline chart: Requires one numerical column and one categorical column.  
            - Pie Chart: Requires one categorical column.  
            - Row Chart: Requires one categorical column.  
            - Row summary box: Requires any column(s).  
            - Sankey plot: Requires two categorical columns.  
                - If only one categorical variable is available, use it twice.  
            - Scatter plot (2D): Requires two numerical columns and one any column for color.
            - Scatter plot (3D): Requires three numerical columns and one any column for color.  
            - Selection dialog plot: Requires any column.  
            - Stacked row chart: Requires two categorical columns.  
                - If only one categorical variable is available, use it twice.  
            - Table Plot: Requires any column(s).  
            - Text box: Requires no columns, just text.  
            - Violin plot: Requires only one categorical column and one numerical column.  
            - Wordcloud: Requires one categorical column.
    Output format: Only return the python code that is to be run to generate the charts.

    
"""
    )
    return prompt_RAG