Source code for github_utils
import os
import time

import nbformat
import requests
from dotenv import load_dotenv

# Load environment variables (e.g. GITHUB_TOKEN) from a .env file
load_dotenv()
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
GITHUB_REPO = "Taylor-CCB-Group/MDV" # @param {type:"string"}
# BRANCH_NAME = "mk-API"
COMMIT_HASH = "af4192b" # latest commit by mk as of this writing
PROJECT_PATH_1 = "python/mdvtools/charts"
PROJECT_PATH_2 = "python/mdvtools/test_projects"
def crawl_github_repo(
    url: str = GITHUB_REPO,
    is_sub_dir: bool = False,
    branch_or_commit_name: str = COMMIT_HASH,
    project_path: str = PROJECT_PATH_2,
    access_token=GITHUB_TOKEN,
):
    """
    Crawls a GitHub repository to retrieve file URLs based on specified criteria.

    Args:
        url (str): The GitHub repository slug ("owner/repo"), or a sub-directory
            API URL when is_sub_dir is True.
        is_sub_dir (bool): Flag indicating whether the current URL is a sub-directory.
        branch_or_commit_name (str): The branch name or commit hash to crawl.
        project_path (str): The path of the project within the repository.
        access_token (str, optional): GitHub access token for authentication.
            Defaults to GITHUB_TOKEN.

    Returns:
        list: List of HTML URLs for the files that match the criteria.
    """
    # List of files to ignore
    ignore_list = ["__init__.py", "pbmc3k_tutorial.ipynb", "pbmc3k_tutorial.py"]
    # Determine the appropriate API URL based on whether it's a sub-directory
    if not is_sub_dir:
        api_url = f"https://api.github.com/repos/{url}/contents/{project_path}?ref={branch_or_commit_name}"
    else:
        api_url = url
    # Set up headers for the GitHub API request; only send an Authorization
    # header when a token is actually available
    headers = {"Accept": "application/vnd.github.v3+json"}
    if access_token:
        headers["Authorization"] = f"Bearer {access_token}"
    # Make a GET request to the GitHub API
    response = requests.get(api_url, headers=headers)
    # Raise an exception for any request errors
    response.raise_for_status()
    # Initialize an empty list to store file URLs
    files = []
    # Parse the JSON response content
    contents = response.json()
    # Iterate over the items in the contents
    for item in contents:
        # Collect files that are not ignored and are Python sources or notebooks
        if (
            item["type"] == "file"
            and item["name"] not in ignore_list
            and item["name"].endswith((".py", ".ipynb"))
        ):
            files.append(item["html_url"])
        # Recurse into sub-directories (excluding hidden ones)
        elif item["type"] == "dir" and not item["name"].startswith("."):
            sub_files = crawl_github_repo(
                item["url"], True, branch_or_commit_name, project_path
            )
            # Pause briefly to avoid hitting the GitHub API rate limit
            time.sleep(0.1)
            # Add the sub-directory files to the list
            files.extend(sub_files)
    # Return the list of collected file URLs
    return files
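# Example usage (an illustrative sketch, not part of the original module):
# crawl both project paths at the pinned commit. Assumes GITHUB_TOKEN is set
# in the environment or a .env file; unauthenticated requests are rate-limited.
if __name__ == "__main__":
    chart_files = crawl_github_repo(project_path=PROJECT_PATH_1)
    test_files = crawl_github_repo()  # project_path defaults to PROJECT_PATH_2
    print(f"Collected {len(chart_files) + len(test_files)} candidate file URLs")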
# Extracts the Python code from a .ipynb (Jupyter Notebook) file from GitHub
# Extracts the Python code from a .py file from GitHub
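# A minimal sketch of these two helpers (hypothetical names and logic, inferred
# from the comments above and the requests/nbformat imports): fetch the raw
# file content from GitHub, and for notebooks keep only the code cells.
def extract_python_code_from_py(github_html_url: str) -> str:
    # Convert a github.com "blob" URL into its raw.githubusercontent.com form
    raw_url = github_html_url.replace(
        "github.com", "raw.githubusercontent.com"
    ).replace("/blob/", "/", 1)
    response = requests.get(raw_url)
    response.raise_for_status()
    return response.text


def extract_python_code_from_ipynb(github_html_url: str) -> str:
    # Reuse the raw download above, then parse the notebook JSON with nbformat
    notebook = nbformat.reads(
        extract_python_code_from_py(github_html_url), as_version=4
    )
    # Keep only the source of code cells, skipping markdown and raw cells
    code_cells = [cell.source for cell in notebook.cells if cell.cell_type == "code"]
    return "\n\n".join(code_cells)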