from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup
def download_sitemap(url, timeout=10):
    """Fetch a sitemap and return its body as text.

    Args:
        url: URL of the sitemap to download.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout ``requests`` would wait indefinitely on a
            stalled connection.

    Returns:
        The response body as decoded by ``requests`` (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
def parse_sitemap(sitemap_xml, base_domain):
    """Build a nested dict mirroring the URL paths listed in a sitemap.

    Args:
        sitemap_xml: Sitemap document to parse.
        base_domain: Key under which the whole tree is stored in the result.

    Returns:
        ``{base_domain: tree}`` where every key of ``tree`` is one path
        segment and every value is the dict of segments found beneath it.
        Query strings and fragments of absolute URLs are ignored, and empty
        segments (trailing or doubled slashes) never become keys.
    """
    soup = BeautifulSoup(sitemap_xml, 'xml')
    root = {}
    for loc in soup.find_all('loc'):
        parts = urlsplit(loc.text.strip())
        if parts.netloc:
            # Absolute URL: scheme, host, query and fragment are already split off.
            path = parts.path
        else:
            # Scheme-less entry: treat everything before the first '/' as the host.
            path = parts.path.partition('/')[2]
        node = root
        # Dropping empty segments keeps 'a/b/' and 'a//b' from creating '' keys.
        for segment in filter(None, path.split('/')):
            node = node.setdefault(segment, {})
    return {base_domain: root}
def capitalize_first_letter(text):
    """Apply ``str.capitalize`` to every '/'-separated segment of ``text``.

    Each segment gets its first character upper-cased and the rest
    lower-cased; the segments are then rejoined with '/'.
    """
    pieces = text.split('/')
    return '/'.join(map(str.capitalize, pieces))
def _markdown_lines(structure, base_url, parent_path, prefix):
    """Return one Markdown bullet per page in ``structure``, depth-first."""
    lines = []
    for page, subpages in structure.items():
        if page == base_url:
            # The base-domain entry links to the site root and adds no path segment.
            page_path = ''
            page_display = base_url.capitalize()
            full_url = f"https://{base_url}"
        else:
            page_path = f"{parent_path}/{page}" if parent_path else page
            page_display = capitalize_first_letter(page)
            full_url = f"https://{base_url}/{page_path}"
        lines.append(f"{prefix}- [{page_display}]({full_url})")
        if subpages:
            # Two spaces align the child marker with the parent's text; a
            # single space is not enough for Markdown to nest the list.
            lines.extend(_markdown_lines(subpages, base_url, page_path, prefix + "  "))
    return lines


def generate_markdown(structure, base_url, parent_path='', output_file='output.md', prefix=''):
    """Render a nested page structure as an indented Markdown link list.

    Args:
        structure: Nested dict mapping a page name to the dict of its subpages.
            A key equal to ``base_url`` is rendered as the site root.
        base_url: Domain used to build every link (``https://{base_url}/...``).
        parent_path: Path already accumulated above ``structure``; it is
            prepended to each page's own path.
        output_file: File to write the list to. Pass a falsy value (``None``)
            to skip writing.
        prefix: Indentation placed before the bullets of this level.

    Returns:
        The generated lines, one list item per page, whether or not a file
        was written.
    """
    lines = _markdown_lines(structure, base_url, parent_path, prefix)
    if output_file:
        # Explicit encoding so the output does not depend on the platform default.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))
    return lines
# Domain whose sitemap is read; also used as the root key of the page tree.
base_domain = 'blue.cc'
sitemap_url = f'https://{base_domain}/sitemap.xml'  # Your sitemap URL


def main():
    """Download the sitemap, build the page tree and generate the Markdown list."""
    sitemap_xml = download_sitemap(sitemap_url)
    page_structure = parse_sitemap(sitemap_xml, base_domain)
    generate_markdown(page_structure, base_domain)
    print("Markdown file generated.")


# Guarded so importing this module does not trigger a download or a file write.
if __name__ == "__main__":
    main()