Source code for github_utils
import os
import time

import nbformat
import requests
from dotenv import load_dotenv

# Load environment variables (e.g. GITHUB_TOKEN) from a .env file
load_dotenv()
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
GITHUB_REPO = "Taylor-CCB-Group/MDV" # @param {type:"string"}
# BRANCH_NAME = "mk-API"
COMMIT_HASH = "af4192b" # latest commit by mk as of this writing
PROJECT_PATH_1 = "python/mdvtools/charts"
PROJECT_PATH_2 = "python/mdvtools/test_projects"
def crawl_github_repo(
    url: str = GITHUB_REPO,
    is_sub_dir: bool = False,
    branch_or_commit_name: str = COMMIT_HASH,
    project_path: str = PROJECT_PATH_2,
    access_token=GITHUB_TOKEN,
):
    """
    Crawls a GitHub repository to retrieve file URLs based on specified criteria.

    Args:
        url (str): The GitHub repository slug ("owner/repo"), or a sub-directory
            API URL when is_sub_dir is True.
        is_sub_dir (bool): Flag indicating whether the current URL is a sub-directory.
        branch_or_commit_name (str): The branch name or commit hash to crawl.
        project_path (str): The path of the project within the repository.
        access_token (str, optional): GitHub access token for authentication.
            Defaults to GITHUB_TOKEN.

    Returns:
        list: List of HTML URLs for the files that match the criteria.
    """
    # List of files to ignore
    ignore_list = ["__init__.py", "pbmc3k_tutorial.ipynb", "pbmc3k_tutorial.py"]
    # Determine the appropriate API URL based on whether it's a sub-directory
    if not is_sub_dir:
        api_url = f"https://api.github.com/repos/{url}/contents/{project_path}?ref={branch_or_commit_name}"
    else:
        api_url = url
    # Set up headers for the GitHub API request; only send an Authorization
    # header when a token is actually available
    headers = {"Accept": "application/vnd.github.v3+json"}
    if access_token:
        headers["Authorization"] = f"Bearer {access_token}"
    # Make a GET request to the GitHub API
    response = requests.get(api_url, headers=headers)
    # Raise an exception for any request errors
    response.raise_for_status()
    # Initialize an empty list to store file URLs
    files = []
    # Parse the JSON response content
    contents = response.json()
    # Iterate over the items in the contents
    for item in contents:
        # Collect files that are not ignored and are Python sources or notebooks
        if (
            item["type"] == "file"
            and item["name"] not in ignore_list
            and item["name"].endswith((".py", ".ipynb"))
        ):
            files.append(item["html_url"])
        # Recurse into sub-directories (excluding hidden ones)
        elif item["type"] == "dir" and not item["name"].startswith("."):
            sub_files = crawl_github_repo(
                item["url"], True, branch_or_commit_name, project_path
            )
            # Pause briefly to avoid hitting the GitHub API rate limit
            time.sleep(0.1)
            # Add the sub-directory files to the list
            files.extend(sub_files)
    # Return the list of collected file URLs
    return files
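# Example usage (an illustrative sketch, not part of the original module):
# crawl both project paths at the pinned commit. Assumes GITHUB_TOKEN is set
# in the environment or a .env file; unauthenticated requests are rate-limited.
if __name__ == "__main__":
    chart_files = crawl_github_repo(project_path=PROJECT_PATH_1)
    test_files = crawl_github_repo()  # project_path defaults to PROJECT_PATH_2
    print(f"Collected {len(chart_files) + len(test_files)} candidate file URLs")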
# Extracts the Python code from a .ipynb (Jupyter Notebook) file from GitHub
# Extracts the Python code from a .py file from GitHub
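# A minimal sketch of these two helpers (hypothetical names and logic, inferred
# from the comments above and the requests/nbformat imports): fetch the raw
# file content from GitHub, and for notebooks keep only the code cells.
def extract_python_code_from_py(github_html_url: str) -> str:
    # Convert a github.com "blob" URL into its raw.githubusercontent.com form
    raw_url = github_html_url.replace(
        "github.com", "raw.githubusercontent.com"
    ).replace("/blob/", "/", 1)
    response = requests.get(raw_url)
    response.raise_for_status()
    return response.text


def extract_python_code_from_ipynb(github_html_url: str) -> str:
    # Reuse the raw download above, then parse the notebook JSON with nbformat
    notebook = nbformat.reads(
        extract_python_code_from_py(github_html_url), as_version=4
    )
    # Keep only the source of code cells, skipping markdown and raw cells
    code_cells = [cell.source for cell in notebook.cells if cell.cell_type == "code"]
    return "\n\n".join(code_cells)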