Write a Python script to clean HTML files

python
import os
import re
from bs4 import BeautifulSoup

def clean_html_file(input_path, output_path):
    """Extract the readable text of one HTML file and save it as plain text.

    The file is parsed with BeautifulSoup's ``html.parser``; ``<script>`` and
    ``<style>`` elements and HTML comments are removed; the remaining text is
    split into lines, each line is stripped, blank lines are dropped, and the
    result is written with one non-empty line per row.

    Args:
        input_path: Path of the HTML file to read (decoded as UTF-8).
        output_path: Path of the text file to write (encoded as UTF-8).
            An existing file at this path is overwritten.
    """
    # Local import: the comment filter needs the Comment class itself.
    # ``soup.Comment`` is not that class -- attribute access on a soup object
    # looks up a child tag named "Comment" -- so the previous isinstance
    # check never matched any comment node.
    from bs4 import Comment

    with open(input_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse HTML
    soup = BeautifulSoup(content, 'html.parser')

    # Remove script and style elements together with their contents
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()

    # Remove comments (find_all returns a list, so extracting while looping
    # does not disturb the iteration)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Get the text, separating adjacent text nodes with a real newline
    # (the separator was previously the literal letter 'n')
    text = soup.get_text(separator='\n')

    # Strip every line once and keep only the non-empty ones
    stripped_lines = (line.strip() for line in text.splitlines())
    clean_text = '\n'.join(line for line in stripped_lines if line)

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(clean_text)

def clean_html_files_in_folder(folder_path, output_folder):
    """Clean every ``.html`` / ``.htm`` file directly inside a folder.

    Each matching file is passed to ``clean_html_file`` and the result is
    written to ``output_folder`` under the same base name with a ``.txt``
    extension. Sub-folders are not descended into. A progress line is
    printed for every file processed.

    Args:
        folder_path: Directory containing the HTML files.
        output_folder: Directory for the cleaned text files; created
            (including missing parents) if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.endswith(('.html', '.htm')):
            continue

        input_file = os.path.join(folder_path, filename)
        # A directory can also be named "something.html"; opening it would fail
        if not os.path.isfile(input_file):
            continue

        # Swap only the trailing extension. The name is known to end in
        # ".html" or ".htm", so the last dot is the extension dot; a chained
        # str.replace would also rewrite ".htm"/".html" elsewhere in the name.
        base_name = filename.rpartition('.')[0]
        output_file = os.path.join(output_folder, base_name + '.txt')

        clean_html_file(input_file, output_file)
        print(f"Cleaned: {filename} -> {output_file}")

# Script entry point: run the batch cleaner on the sample directories below
if __name__ == "__main__":
    folder = "html_files"          # Source directory holding the HTML inputs
    output_folder = "cleaned_txt"  # Destination directory for the .txt results
    clean_html_files_in_folder(folder_path=folder, output_folder=output_folder)

This script:

- Loads HTML files from a folder.
- Strips out scripts, styles, and comments.
- Extracts and cleans the readable text.
- Saves the cleaned text to a new folder as .txt files.

You can change the `folder` and `output_folder` variables in the example-usage section to point to your own input and output directories.

Share This Page:

Comments