Write a Python script to clean HTML files

python
import os
import re
from bs4 import BeautifulSoup

def clean_html_file(input_path, output_path):
    """Extract the readable text of one HTML file and save it as plain text.

    Script and style elements and HTML comments are removed from the parsed
    document. The remaining text is split into lines, each line is stripped
    of surrounding whitespace, and empty lines are dropped before writing.

    Args:
        input_path: Path of the HTML file to read (decoded as UTF-8).
        output_path: Path of the text file to write (encoded as UTF-8);
            an existing file is overwritten.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input file is not valid UTF-8.
    """
    # Local import keeps this function self-contained; bs4 is already a
    # dependency of this file.
    from bs4 import Comment

    with open(input_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse HTML
    soup = BeautifulSoup(content, 'html.parser')

    # Remove script and style elements
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()

    # Remove comments. The class must be bs4.Comment: attribute access such
    # as `soup.Comment` is a tag lookup that returns None, so an isinstance
    # check against its type never matches a comment node.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Get clean text, one text node per line ('\n', not the letter 'n').
    text = soup.get_text(separator='\n')

    # Remove extra whitespace and blank lines
    stripped_lines = (line.strip() for line in text.splitlines())
    clean_text = '\n'.join(line for line in stripped_lines if line)

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(clean_text)

def clean_html_files_in_folder(folder_path, output_folder):
    """Clean every ``.html``/``.htm`` file in a folder into a ``.txt`` file.

    Each matching file directly inside ``folder_path`` (no recursion) is
    passed to ``clean_html_file`` and written to ``output_folder`` under the
    same name with its final extension replaced by ``.txt``. A progress line
    is printed for every file processed.

    Args:
        folder_path: Directory to scan for HTML files.
        output_folder: Directory for the cleaned text files; created if it
            does not exist.

    Raises:
        OSError: If ``folder_path`` cannot be listed or ``output_folder``
            cannot be created.
    """
    # exist_ok avoids the race between a separate exists() check and creation.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.endswith(('.html', '.htm')):
            continue

        input_file = os.path.join(folder_path, filename)
        # Swap only the trailing extension. A blanket str.replace would also
        # rewrite '.html'/'.htm' occurring earlier in the name
        # (e.g. 'a.html.bak.html' -> 'a.txt.bak.txt').
        stem = filename.rpartition('.')[0]
        output_file = os.path.join(output_folder, stem + '.txt')

        clean_html_file(input_file, output_file)
        print(f"Cleaned: {filename} -> {output_file}")

# Example usage: runs only when this file is executed as a script.
if __name__ == "__main__":
    # Folder containing HTML files
    folder = "html_files"
    # Folder to save cleaned text files
    output_folder = "cleaned_txt"

    clean_html_files_in_folder(folder_path=folder, output_folder=output_folder)

This script:

Loads HTML files from a folder.
Strips out scripts, styles, and comments.
Extracts and cleans readable text.
Saves cleaned text to a new folder as .txt files.

You can change the folder and output_folder variables at the bottom of the script to point to your own directories.

Share this Page your favorite way: Click any app below to share.

See all the ways to share this page

Check out our newest posts:

Why your ML system design must support partial retraining

Why your ML pipeline must detect missing or stale features

Why your ML feedback loop must consider label quality

Why your ML deployment plan must include fallback logic