import os
import re
import time
from urllib.parse import urljoin, urlparse

import html2text
import requests
from bs4 import BeautifulSoup

# Base URL of the documentation
BASE_URL = "https://docs.manim.community/en/stable/"

# Base directory to save the markdown files
OUTPUT_DIR = "docs_md"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# A session to reuse HTTP connections
session = requests.Session()


def is_valid_url(url):
    """
    Only allow URLs that belong to the docs.manim.community/en/stable/ site.
    """
    parsed = urlparse(url)
    base_parsed = urlparse(BASE_URL)
    return (parsed.scheme in ("http", "https")
            and parsed.netloc == base_parsed.netloc
            and parsed.path.startswith(base_parsed.path))


def url_to_local_path(url):
    """
    Convert a URL into a local file path that preserves the URL's folder structure.

    For example, a URL ending with:
        /_modules/manim/mobject/geometry/line.html
    will be saved as:
        docs_md/_modules/manim/mobject/geometry/line.html.md
    """
    parsed = urlparse(url)
    base_path = urlparse(BASE_URL).path
    # Get the relative path after the base
    rel_path = parsed.path[len(base_path):].lstrip("/")
    if not rel_path:
        rel_path = "index.html"
    local_path = os.path.join(OUTPUT_DIR, rel_path)
    # Ensure the file ends with .md (appending .md even if it ends with .html)
    local_path += ".md"
    return local_path


def convert_html_to_markdown(html_content):
    """
    Convert HTML content to Markdown using html2text.
    """
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.body_width = 0  # do not wrap lines
    return h.handle(html_content)


def crawl(url, visited):
    """
    Recursively crawl the documentation pages starting from the given URL.
    """
    if url in visited:
        return
    print(f"Processing: {url}")
    visited.add(url)

    try:
        response = session.get(url)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to get {url}: {e}")
        return

    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract only the first element with class "content"
    content_div = soup.find(class_="content")
    if content_div:
        content_html = str(content_div)
    else:
        print(f"No content div found in {url}; using full page.")
        content_html = html_content

    markdown = convert_html_to_markdown(content_html)

    # Determine the local file path and ensure its directory exists
    local_path = url_to_local_path(url)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    with open(local_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Saved markdown to {local_path}")

    # Find and process links on the page
    for link in soup.find_all("a", href=True):
        href = link["href"]
        full_url = urljoin(url, href)
        full_url = full_url.split("#")[0]  # remove any fragment identifier
        if is_valid_url(full_url) and full_url not in visited:
            time.sleep(0.1)  # be polite with a short delay
            crawl(full_url, visited)
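
# Side note (added, not part of the original flow): crawl() recurses once per
# newly discovered link, so a sufficiently long chain of unvisited pages could,
# in principle, run into Python's recursion limit. Below is a rough, optional
# sketch of the same traversal with an explicit queue instead of recursion; the
# name crawl_iterative is made up here, the function reuses the helpers defined
# above, and it is not called from __main__.
def crawl_iterative(start_url):
    """Breadth-first variant of crawl() that uses an explicit queue."""
    from collections import deque

    visited = set()
    queue = deque([start_url])
    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        print(f"Processing: {url}")
        visited.add(url)

        try:
            response = session.get(url)
            response.raise_for_status()
        except Exception as e:
            print(f"Failed to get {url}: {e}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        content_div = soup.find(class_="content")
        content_html = str(content_div) if content_div else response.text

        local_path = url_to_local_path(url)
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        with open(local_path, "w", encoding="utf-8") as f:
            f.write(convert_html_to_markdown(content_html))

        # Queue up in-site links; duplicates are filtered out when dequeued.
        for link in soup.find_all("a", href=True):
            full_url = urljoin(url, link["href"]).split("#")[0]
            if is_valid_url(full_url) and full_url not in visited:
                queue.append(full_url)

        time.sleep(0.1)  # be polite with a short delay
    return visited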
""" with open(output_file, "w", encoding="utf-8") as out: def process_dir(current_dir, level): # Write a heading for the current directory (skip if we're at the root) if os.path.abspath(current_dir) != os.path.abspath(root_dir): dir_name = os.path.basename(current_dir) out.write("\n" + "#" * level + " " + dir_name + "\n\n") # Get sorted list of items items = sorted(os.listdir(current_dir)) # Separate directories and markdown files dirs = [i for i in items if os.path.isdir(os.path.join(current_dir, i))] md_files = [i for i in items if os.path.isfile(os.path.join(current_dir, i)) and i.endswith(".md")] # Process markdown files in the current directory for md_file in md_files: file_path = os.path.join(current_dir, md_file) # Use a heading level one deeper than the directory out.write("\n" + "#" * (level + 1) + " " + md_file + "\n\n") with open(file_path, "r", encoding="utf-8") as f: out.write(f.read() + "\n\n") # Recursively process subdirectories for d in dirs: process_dir(os.path.join(current_dir, d), level + 1) process_dir(root_dir, 1) print(f"Combined markdown saved to {output_file}") if __name__ == "__main__": visited = set() crawl(BASE_URL, visited) print("Download complete.") # After crawling, combine all markdown files into one huge markdown file. combined_output = "combined_docs.md" combine_markdown_files(OUTPUT_DIR, combined_output)