Categories We Write About

Write a Python script to clean HTML files

python
"""Convert a folder of HTML files into plain-text files.

Each HTML file has its <script>/<style> elements and HTML comments removed,
its visible text extracted, and blank lines dropped before being written out
as a ``.txt`` file.
"""

import os
import re

from bs4 import BeautifulSoup, Comment

# File extensions (lower-case) treated as HTML input.
HTML_EXTENSIONS = ('.html', '.htm')


def clean_html_file(input_path, output_path):
    """Extract the readable text of one HTML file and write it to a text file.

    Args:
        input_path: Path of the HTML file to read (decoded as UTF-8).
        output_path: Path of the text file to write (encoded as UTF-8).
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse HTML
    soup = BeautifulSoup(content, 'html.parser')

    # Remove script and style elements together with their contents.
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()

    # Remove comments. Comment must be imported from bs4: `soup.Comment` is a
    # tag lookup that yields None, so it can never match a comment node.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Get clean text, one text fragment per line.
    text = soup.get_text(separator='\n')

    # Remove whitespace-only lines and trim the remaining ones.
    clean_lines = [line.strip() for line in text.splitlines() if line.strip()]
    clean_text = '\n'.join(clean_lines)

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(clean_text)


def clean_html_files_in_folder(folder_path, output_folder):
    """Clean every ``.html``/``.htm`` file directly inside *folder_path*.

    Args:
        folder_path: Folder containing the HTML files (not searched
            recursively).
        output_folder: Folder that receives the ``.txt`` results; created if
            it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.lower().endswith(HTML_EXTENSIONS):
            continue

        input_file = os.path.join(folder_path, filename)
        # A directory may carry an HTML-looking name; only real files are read.
        if not os.path.isfile(input_file):
            continue

        # Swap only the final extension, so ".htm" occurring elsewhere in the
        # name is left alone.
        stem, _ = os.path.splitext(filename)
        output_file = os.path.join(output_folder, stem + '.txt')

        clean_html_file(input_file, output_file)
        print(f"Cleaned: {filename} -> {output_file}")


# Example usage
if __name__ == "__main__":
    folder = "html_files"  # Folder containing HTML files
    output_folder = "cleaned_txt"  # Folder to save cleaned text files
    clean_html_files_in_folder(folder, output_folder)

This script:

  • Loads HTML files from a folder.

  • Strips out scripts, styles, and comments.

  • Extracts and cleans readable text.

  • Saves cleaned text to a new folder as .txt files.

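You can change the `folder` and `output_folder` variables to point to your own directories. The script requires the `beautifulsoup4` package (`pip install beautifulsoup4`).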

Share This Page:

Enter your email below to join The Palos Publishing Company Email List

We respect your email privacy

Comments

Leave a Reply

Your email address will not be published. Required fields are marked *

Categories We Write About