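"""Download the Manim Community documentation as Markdown.

Crawls every page under https://docs.manim.community/en/stable/, converts the
main content of each page to Markdown with html2text, mirrors the site's
folder structure under docs_md/, and finally merges all saved files into one
combined Markdown document.
"""
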
import os
import re
import time
import requests
import html2text
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# Base URL of the documentation
BASE_URL = "https://docs.manim.community/en/stable/"

# Base directory to save the markdown files
OUTPUT_DIR = "docs_md"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# A session to reuse HTTP connections
session = requests.Session()

def is_valid_url(url):
    """
    Only allow URLs that belong to the docs.manim.community/en/stable/ site.
    """
    parsed = urlparse(url)
    base_parsed = urlparse(BASE_URL)
    return (parsed.scheme in ("http", "https") and
            parsed.netloc == base_parsed.netloc and
            parsed.path.startswith(base_parsed.path))

def url_to_local_path(url):
    """
    Convert a URL into a local file path that preserves the URL's folder structure.

    For example, a URL ending with:
        /_modules/manim/mobject/geometry/line.html
    will be saved as:
        docs_md/_modules/manim/mobject/geometry/line.html.md
    """
    parsed = urlparse(url)
    base_path = urlparse(BASE_URL).path
    # Get the relative path after the base
    rel_path = parsed.path[len(base_path):].lstrip("/")
    if not rel_path:
        rel_path = "index.html"
    local_path = os.path.join(OUTPUT_DIR, rel_path)
    # Ensure the file ends with .md (appending .md even if it ends with .html)
    local_path += ".md"
    return local_path

def convert_html_to_markdown(html_content):
    """
    Convert HTML content to Markdown using html2text.
    """
    h = html2text.HTML2Text()
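    # keep hyperlinks in the converted Markdown (setting this to True would drop them)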
    h.ignore_links = False
    h.body_width = 0  # do not wrap lines
    return h.handle(html_content)

def crawl(url, visited):
    """
    Recursively crawl the documentation pages starting from the given URL.
    """
    if url in visited:
        return
    print(f"Processing: {url}")
    visited.add(url)

    try:
        response = session.get(url)
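        # raise an exception for HTTP error responses (4xx/5xx) so this page is skipped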
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to get {url}: {e}")
        return

    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract only the first element with class "content"
    content_div = soup.find(class_="content")
    if content_div:
        content_html = str(content_div)
    else:
        print(f"No content div found in {url}; using full page.")
        content_html = html_content

    markdown = convert_html_to_markdown(content_html)

    # Determine the local file path and ensure its directory exists
    local_path = url_to_local_path(url)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)

    with open(local_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Saved markdown to {local_path}")

    # Find and process links on the page
    for link in soup.find_all("a", href=True):
        href = link["href"]
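        # resolve relative hrefs against the current page URL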
        full_url = urljoin(url, href)
        full_url = full_url.split("#")[0]  # remove any fragment identifier
        if is_valid_url(full_url) and full_url not in visited:
            time.sleep(0.1)  # be polite with a short delay
            crawl(full_url, visited)

def combine_markdown_files(root_dir, output_file):
    """
    Recursively traverse root_dir and combine all .md files into one huge Markdown file.
    A heading structure (with '#' characters) is added based on the folder hierarchy.
    """
    with open(output_file, "w", encoding="utf-8") as out:
        def process_dir(current_dir, level):
            # Write a heading for the current directory (skip if we're at the root)
            if os.path.abspath(current_dir) != os.path.abspath(root_dir):
                dir_name = os.path.basename(current_dir)
                out.write("\n" + "#" * level + " " + dir_name + "\n\n")

            # Get sorted list of items
            items = sorted(os.listdir(current_dir))
            # Separate directories and markdown files
            dirs = [i for i in items if os.path.isdir(os.path.join(current_dir, i))]
            md_files = [i for i in items if os.path.isfile(os.path.join(current_dir, i)) and i.endswith(".md")]

            # Process markdown files in the current directory
            for md_file in md_files:
                file_path = os.path.join(current_dir, md_file)
                # Use a heading level one deeper than the directory
                out.write("\n" + "#" * (level + 1) + " " + md_file + "\n\n")
                with open(file_path, "r", encoding="utf-8") as f:
                    out.write(f.read() + "\n\n")

            # Recursively process subdirectories
            for d in dirs:
                process_dir(os.path.join(current_dir, d), level + 1)

        process_dir(root_dir, 1)
    print(f"Combined markdown saved to {output_file}")

if __name__ == "__main__":
    visited = set()
    crawl(BASE_URL, visited)
    print("Download complete.")

    # After crawling, combine all markdown files into one huge markdown file.
    combined_output = "combined_docs.md"
    combine_markdown_files(OUTPUT_DIR, combined_output)