import os
import re
import time
import requests
import html2text
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# Base URL of the documentation
BASE_URL = "https://docs.manim.community/en/stable/"

# Base directory to save the markdown files
OUTPUT_DIR = "docs_md"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# A session to reuse HTTP connections
session = requests.Session()

def is_valid_url(url):
    """
    Only allow URLs that belong to the docs.manim.community/en/stable/ site.
    """
    parsed = urlparse(url)
    base_parsed = urlparse(BASE_URL)
    return (parsed.scheme in ("http", "https") and
            parsed.netloc == base_parsed.netloc and
            parsed.path.startswith(base_parsed.path))
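
# Illustrative examples of the check above (URLs shown only to demonstrate the
# scheme/netloc/path filtering; they are not fetched here):
#   is_valid_url("https://docs.manim.community/en/stable/tutorials/quickstart.html")  -> True
#   is_valid_url("https://github.com/ManimCommunity/manim")                           -> False  (different host)
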
def url_to_local_path(url):
    """
    Convert a URL into a local file path that preserves the URL's folder structure.
    For example, a URL ending with:
        /_modules/manim/mobject/geometry/line.html
    will be saved as:
        docs_md/_modules/manim/mobject/geometry/line.html.md
    """
    parsed = urlparse(url)
    base_path = urlparse(BASE_URL).path
    # Get the relative path after the base
    rel_path = parsed.path[len(base_path):].lstrip("/")
    if not rel_path:
        rel_path = "index.html"
    local_path = os.path.join(OUTPUT_DIR, rel_path)
    # Ensure the file ends with .md (append .md even if it already ends with .html)
    local_path += ".md"
    return local_path
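
# A worked example of the mapping above (illustrative URL, assuming the
# BASE_URL and OUTPUT_DIR defined at the top of this file):
#   url_to_local_path("https://docs.manim.community/en/stable/reference.html")
#   -> "docs_md/reference.html.md"
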
def convert_html_to_markdown(html_content):
    """
    Convert HTML content to Markdown using html2text.
    """
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.body_width = 0  # do not wrap lines
    return h.handle(html_content)

def crawl(url, visited):
    """
    Recursively crawl the documentation pages starting from the given URL.
    """
    if url in visited:
        return
    print(f"Processing: {url}")
    visited.add(url)
    try:
        response = session.get(url)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to get {url}: {e}")
        return
    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")
    # Extract only the first element with class "content"
    content_div = soup.find(class_="content")
    if content_div:
        content_html = str(content_div)
    else:
        print(f"No content div found in {url}; using full page.")
        content_html = html_content
    markdown = convert_html_to_markdown(content_html)
    # Determine the local file path and ensure its directory exists
    local_path = url_to_local_path(url)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    with open(local_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Saved markdown to {local_path}")
    # Find and process links on the page
    for link in soup.find_all("a", href=True):
        href = link["href"]
        full_url = urljoin(url, href)
        full_url = full_url.split("#")[0]  # remove any fragment identifier
        if is_valid_url(full_url) and full_url not in visited:
            time.sleep(0.1)  # be polite with a short delay
            crawl(full_url, visited)
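
# Note: crawl() recurses once per newly discovered page. On a very large site the
# call depth could approach Python's default recursion limit (1000 frames); if that
# ever becomes a problem, an iterative worklist (e.g. collections.deque) would be a
# safer structure than recursion.
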
def combine_markdown_files(root_dir, output_file):
    """
    Recursively traverse root_dir and combine all .md files into one huge Markdown file.
    A heading structure (with '#' characters) is added based on the folder hierarchy.
    """
    with open(output_file, "w", encoding="utf-8") as out:
        def process_dir(current_dir, level):
            # Write a heading for the current directory (skip if we're at the root)
            if os.path.abspath(current_dir) != os.path.abspath(root_dir):
                dir_name = os.path.basename(current_dir)
                out.write("\n" + "#" * level + " " + dir_name + "\n\n")
            # Get sorted list of items
            items = sorted(os.listdir(current_dir))
            # Separate directories and markdown files
            dirs = [i for i in items if os.path.isdir(os.path.join(current_dir, i))]
            md_files = [i for i in items if os.path.isfile(os.path.join(current_dir, i)) and i.endswith(".md")]
            # Process markdown files in the current directory
            for md_file in md_files:
                file_path = os.path.join(current_dir, md_file)
                # Use a heading level one deeper than the directory
                out.write("\n" + "#" * (level + 1) + " " + md_file + "\n\n")
                with open(file_path, "r", encoding="utf-8") as f:
                    out.write(f.read() + "\n\n")
            # Recursively process subdirectories
            for d in dirs:
                process_dir(os.path.join(current_dir, d), level + 1)
        process_dir(root_dir, 1)
    print(f"Combined markdown saved to {output_file}")
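
# Example of the resulting heading scheme (hypothetical paths, following the
# layout crawl() produces under OUTPUT_DIR):
#   docs_md/index.html.md                 -> "## index.html.md"
#   docs_md/reference/line.html.md        -> "## reference" followed by "### line.html.md"
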
if __name__ == "__main__":
    visited = set()
    crawl(BASE_URL, visited)
    print("Download complete.")
    # After crawling, combine all markdown files into one huge markdown file.
    combined_output = "combined_docs.md"
    combine_markdown_files(OUTPUT_DIR, combined_output)