#!/usr/bin/env python3
"""
Clean and simplify markdown for general use
"""

import re
import sys
import os

def remove_complex_formatting(text):
    """Strip markdown constructs that have no simple plain-text form.

    Removes, in order: HTML comments/declarations/tags, footnote markers,
    reference-style link definitions, and definition-list entries.

    Args:
        text: Markdown source.

    Returns:
        The text with those constructs deleted. A line that consisted only
        of a removed definition is left empty rather than dropped.
    """
    # Only strip things shaped like real markup: a comment, a declaration,
    # or a tag name followed by whitespace, "/" or ">". A bare "<...>"
    # match would also eat comparisons such as "a < b and c > d" and
    # autolinks such as "<https://example.com>".
    text = re.sub(
        r'<!--.*?-->|<![A-Za-z][^>]*>|</?[A-Za-z][A-Za-z0-9-]*(?:\s[^>]*)?/?>',
        '',
        text,
        flags=re.DOTALL,
    )

    # Footnote markers such as [^1]
    text = re.sub(r'\[\^[^\]]+\]', '', text)

    # Reference-style link definitions ("[id]: url"). Anchored to the start
    # of a line (up to three spaces of indent) so that prose containing
    # "[word]: ..." in the middle of a sentence is not truncated.
    text = re.sub(r'^[ ]{0,3}\[[^\]]+\]:\s*.+', '', text, flags=re.MULTILINE)

    # Definition-list entries (": definition"). This also clears footnote
    # definitions, whose "[^n]" marker was already removed above.
    text = re.sub(r'^:\s+.+$', '', text, flags=re.MULTILINE)

    return text

def simplify_headers(text):
    """Turn ATX headers into emphasis, or drop them when they are too deep.

    Levels 1-2 become bold, levels 3-4 become italic, and any header line
    still starting with four or more '#' afterwards is blanked out.
    """
    # Applied strictly in this order; each rule sees the previous rule's output.
    header_rules = (
        (r'^#\s+(.+)$', r'**\1**'),
        (r'^##\s+(.+)$', r'**\1**'),
        (r'^###\s+(.+)$', r'*\1*'),
        (r'^####\s+(.+)$', r'*\1*'),
        # Whatever header lines are left (deeper levels) are removed entirely
        (r'^#{4,}\s+.+$', ''),
    )

    for pattern, replacement in header_rules:
        text = re.sub(pattern, replacement, text, flags=re.MULTILINE)

    return text

def simplify_lists(text):
    """Replace every unindented list marker with a plain bullet.

    Numbered items ("1.") and the "-", "*" and "+" markers at the very start
    of a line are all rewritten to "• ".
    """
    # Applied one after another, in this order
    marker_patterns = (
        r'^\d+\.\s+',
        r'^-\s+',
        r'^\*\s+',
        r'^\+\s+',
    )

    for marker in marker_patterns:
        text = re.sub(marker, '• ', text, flags=re.MULTILINE)

    return text

def simplify_tables(text):
    """Replace pipe tables with a simple bullet-list rendering.

    Consecutive lines containing '|' are collected and handed to
    convert_table_to_simple_list(); every other line passes through as is.
    """
    output = []
    pending = []  # rows of the table being collected; non-empty means "inside a table"

    for line in text.split('\n'):
        has_pipe = '|' in line
        separator_like = '---' in line or line.startswith('|--')

        if has_pipe and not separator_like:
            # An ordinary pipe row starts or continues a table
            pending.append(line)
        elif not pending:
            # Outside any table: keep the line untouched
            output.append(line)
        elif has_pipe:
            # Separator-looking row inside a table: keep it with the table rows
            pending.append(line)
        else:
            # First line without a pipe closes the table
            output.extend(convert_table_to_simple_list(pending))
            pending = []
            output.append(line)

    # A table running to the end of the text still needs converting
    if pending:
        output.extend(convert_table_to_simple_list(pending))

    return '\n'.join(output)

# A delimiter cell is dashes with optional alignment colons. A lone "-" is
# deliberately not accepted so a data cell meaning "no value" is kept.
_SEPARATOR_CELL_RE = re.compile(r':-+:?|-+:|-{2,}')

def _is_separator_row(row):
    """Return True when every non-empty cell of the row is a delimiter cell."""
    cells = [c.strip() for c in row.split('|') if c.strip()]
    return bool(cells) and all(_SEPARATOR_CELL_RE.fullmatch(c) for c in cells)

def convert_table_to_simple_list(rows):
    """Convert the lines of a pipe table into a simple bullet list.

    Args:
        rows: Table lines in document order; the first one is the header.

    Returns:
        A list of output lines: a bold "Table: ..." caption built from the
        header cells (omitted when the header has no non-empty cell), then
        one "• cell, cell, ..." line per data row. Delimiter rows and rows
        without any non-empty cell are skipped. Returns [] for empty input.
    """
    if not rows:
        return []

    # Parse header
    headers = [h.strip() for h in rows[0].split('|') if h.strip()]

    converted = []

    # Add table as simple description
    if headers:
        converted.append(f"**Table: {', '.join(headers)}**")

    # Convert each row
    for row in rows[1:]:
        # Decide per cell, not by substring: a plain "'---' in row" check
        # would drop data rows such as "|--verbose| ... |" and would miss
        # alignment-only delimiters such as "|:-:|:-:|".
        if _is_separator_row(row):
            continue

        cells = [c.strip() for c in row.split('|') if c.strip()]
        if cells:
            # Join cells with commas
            converted.append(f"• {', '.join(cells)}")

    return converted

def simplify_links(text):
    """Reduce inline links to their bare URL and drop inline images.

    Args:
        text: Markdown source.

    Returns:
        The text where "[label](url)" is replaced by "url", "![alt](src)"
        is removed, and a linked image "[![alt](src)](url)" is replaced by
        "url".
    """
    # Linked images (badges) first: keep only the link target. Handling them
    # as a unit avoids leaving an empty "[](url)" behind.
    text = re.sub(r'\[!\[[^\]]*\]\([^)]+\)\]\(([^)]+)\)', r'\1', text)

    # Images must go before plain links: the link pattern also matches the
    # "[alt](src)" part of an image and would leave a stray "!src" behind.
    text = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', text)

    # Convert [text](url) to url
    text = re.sub(r'\[[^\]]+\]\(([^)]+)\)', r'\1', text)

    return text

def optimize_whitespace(text):
    """Normalise line endings and tidy whitespace for readability.

    Args:
        text: Text to tidy.

    Returns:
        The text with "\\n" line endings, no trailing spaces or tabs on any
        line, runs of two or more blank lines collapsed to a single blank
        line, and exactly one trailing newline.
    """
    # Normalize line endings first: '$' only matches before "\n", so with
    # CRLF input the trailing-whitespace rule below would never fire.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Remove trailing whitespace
    text = re.sub(r'[ \t]+$', '', text, flags=re.MULTILINE)

    # Collapse two or more consecutive blank lines into one
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)

    # Ensure text ends with single newline
    return text.rstrip() + '\n'

def clean_markdown(text):
    """Run the full cleaning pipeline over markdown text.

    Each step receives the previous step's output; the final result has
    leading and trailing whitespace stripped.
    """
    # Steps run top to bottom
    pipeline = (
        remove_complex_formatting,
        simplify_headers,
        simplify_tables,
        simplify_lists,
        simplify_links,
        optimize_whitespace,
    )

    for step in pipeline:
        text = step(text)

    return text.strip()

def main():
    """Command-line entry point.

    Usage: clean_markdown.py <input_file> [output_file]

    With no arguments the markdown is read from stdin when stdin is piped
    or redirected, which is what the usage text promises. The cleaned text
    is written to output_file if given, otherwise printed to stdout.
    Statistics, usage and error messages go to stderr so they never mix
    with cleaned output on stdout.

    Exits with status 1 when no input is available or the input file does
    not exist.
    """
    args = sys.argv[1:]
    stdin_has_data = sys.stdin is not None and not sys.stdin.isatty()

    if not args and not stdin_has_data:
        print("Usage: python3 clean_markdown.py <input_file> [output_file]", file=sys.stderr)
        print("       or pipe markdown to stdin", file=sys.stderr)
        sys.exit(1)

    input_file = args[0] if args else None
    output_file = args[1] if len(args) > 1 else None

    # Read input
    if input_file is None:
        markdown_text = sys.stdin.read()
    else:
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
        except FileNotFoundError:
            print(f"Error: File '{input_file}' not found", file=sys.stderr)
            sys.exit(1)

    # Clean
    cleaned_text = clean_markdown(markdown_text)

    # Output
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
        print(f"✅ Cleaned markdown saved to: {output_file}")
    else:
        print(cleaned_text)

    # Show stats
    original_len = len(markdown_text)
    cleaned_len = len(cleaned_text)
    reduction = ((original_len - cleaned_len) / original_len * 100) if original_len > 0 else 0

    print(f"\n📊 Cleaning stats:", file=sys.stderr)
    print(f"   Original: {original_len} characters", file=sys.stderr)
    print(f"   Cleaned: {cleaned_len} characters", file=sys.stderr)
    print(f"   Reduction: {reduction:.1f}%", file=sys.stderr)
    print(f"   Lines: {cleaned_text.count(chr(10)) + 1}", file=sys.stderr)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()