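"""Crawl the Manim Community documentation (stable) into local Markdown files.

Every page reachable from BASE_URL is fetched, its main content is converted
to Markdown with html2text, and the result is saved under OUTPUT_DIR while
preserving the site's folder structure. Afterwards all saved files are merged
into a single combined Markdown document.
"""
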
import os
import re
import time
import requests
import html2text
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# Base URL of the documentation
BASE_URL = "https://docs.manim.community/en/stable/"

# Base directory to save the markdown files
OUTPUT_DIR = "docs_md"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# A session to reuse HTTP connections
session = requests.Session()

def is_valid_url(url):
    """
    Only allow URLs that belong to the docs.manim.community/en/stable/ site.
    """
    parsed = urlparse(url)
    base_parsed = urlparse(BASE_URL)
    return (parsed.scheme in ("http", "https") and
            parsed.netloc == base_parsed.netloc and
            parsed.path.startswith(base_parsed.path))

def url_to_local_path(url):
    """
    Convert a URL into a local file path that preserves the URL's folder structure.

    For example, a URL ending with:
        /_modules/manim/mobject/geometry/line.html
    will be saved as:
        docs_md/_modules/manim/mobject/geometry/line.html.md
    """
    parsed = urlparse(url)
    base_path = urlparse(BASE_URL).path
    # Get the relative path after the base
    rel_path = parsed.path[len(base_path):].lstrip("/")
    if not rel_path:
        rel_path = "index.html"
    local_path = os.path.join(OUTPUT_DIR, rel_path)
    # Ensure the file ends with .md (appending .md even if it ends with .html)
    local_path += ".md"
    return local_path

def convert_html_to_markdown(html_content):
    """
    Convert HTML content to Markdown using html2text.
    """
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.body_width = 0  # do not wrap lines
    return h.handle(html_content)

def crawl(url, visited):
    """
    Recursively crawl the documentation pages starting from the given URL.
    """
    if url in visited:
        return
    print(f"Processing: {url}")
    visited.add(url)

    try:
        response = session.get(url)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to get {url}: {e}")
        return

    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract only the first element with class "content"
    content_div = soup.find(class_="content")
    if content_div:
        content_html = str(content_div)
    else:
        print(f"No content div found in {url}; using full page.")
        content_html = html_content

    markdown = convert_html_to_markdown(content_html)

    # Determine the local file path and ensure its directory exists
    local_path = url_to_local_path(url)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)

    with open(local_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Saved markdown to {local_path}")

    # Find and process links on the page
    for link in soup.find_all("a", href=True):
        href = link["href"]
        full_url = urljoin(url, href)
        full_url = full_url.split("#")[0]  # remove any fragment identifier
        if is_valid_url(full_url) and full_url not in visited:
            time.sleep(0.1)  # be polite with a short delay
            crawl(full_url, visited)

def combine_markdown_files(root_dir, output_file):
    """
    Recursively traverse root_dir and combine all .md files into one huge Markdown file.
    A heading structure (with '#' characters) is added based on the folder hierarchy.
    """
    with open(output_file, "w", encoding="utf-8") as out:
        def process_dir(current_dir, level):
            # Write a heading for the current directory (skip if we're at the root)
            if os.path.abspath(current_dir) != os.path.abspath(root_dir):
                dir_name = os.path.basename(current_dir)
                out.write("\n" + "#" * level + " " + dir_name + "\n\n")

            # Get sorted list of items
            items = sorted(os.listdir(current_dir))
            # Separate directories and markdown files
            dirs = [i for i in items if os.path.isdir(os.path.join(current_dir, i))]
            md_files = [i for i in items if os.path.isfile(os.path.join(current_dir, i)) and i.endswith(".md")]

            # Process markdown files in the current directory
            for md_file in md_files:
                file_path = os.path.join(current_dir, md_file)
                # Use a heading level one deeper than the directory
                out.write("\n" + "#" * (level + 1) + " " + md_file + "\n\n")
                with open(file_path, "r", encoding="utf-8") as f:
                    out.write(f.read() + "\n\n")

            # Recursively process subdirectories
            for d in dirs:
                process_dir(os.path.join(current_dir, d), level + 1)

        process_dir(root_dir, 1)
    print(f"Combined markdown saved to {output_file}")

if __name__ == "__main__":
    visited = set()
    crawl(BASE_URL, visited)
    print("Download complete.")

    # After crawling, combine all markdown files into one huge markdown file.
    combined_output = "combined_docs.md"
    combine_markdown_files(OUTPUT_DIR, combined_output)
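
# If the pages have already been downloaded, the combine step can be re-run on
# its own without crawling again (assuming docs_md/ is still populated from a
# previous run), for example:
#
#     combine_markdown_files(OUTPUT_DIR, "combined_docs.md")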