import os
import re

from bs4 import BeautifulSoup, Comment
def clean_html_file(input_path, output_path):
    """Extract the readable text of one HTML file and save it as plain text.

    Script elements, style elements and HTML comments are removed before the
    text is extracted. Each remaining line is stripped of surrounding
    whitespace and empty lines are dropped.

    Args:
        input_path: Path of the HTML file to read (decoded as UTF-8).
        output_path: Path of the text file to write (encoded as UTF-8,
            overwritten if it already exists).
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        soup = BeautifulSoup(source.read(), 'html.parser')

    # Scripts and styles carry code, not readable text.
    for element in soup(['script', 'style']):
        element.decompose()

    # Match comment nodes by their bs4 node type.
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()

    # A real newline between text nodes keeps separate elements on
    # separate lines so they can be cleaned line by line below.
    text = soup.get_text(separator='\n')
    stripped_lines = (line.strip() for line in text.splitlines())
    clean_text = '\n'.join(line for line in stripped_lines if line)

    with open(output_path, 'w', encoding='utf-8') as target:
        target.write(clean_text)
def clean_html_files_in_folder(folder_path, output_folder):
    """Convert every ``.html``/``.htm`` file in a folder to a ``.txt`` file.

    Only the direct entries of ``folder_path`` are considered; subfolders are
    not searched. Each matching file is passed to ``clean_html_file`` and the
    result is written to ``output_folder`` under the same base name with a
    ``.txt`` extension. One progress line is printed per converted file.

    Args:
        folder_path: Folder containing the HTML files.
        output_folder: Folder that receives the text files; created if it
            does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.endswith(('.html', '.htm')):
            continue

        input_file = os.path.join(folder_path, filename)
        # A directory can also be named "something.html"; it cannot be read.
        if not os.path.isfile(input_file):
            continue

        # Swap only the trailing extension. A substring replace would also
        # rewrite ".htm"/".html" occurring in the middle of the name.
        base_name = filename.rsplit('.', 1)[0]
        output_file = os.path.join(output_folder, base_name + '.txt')

        clean_html_file(input_file, output_file)
        print(f"Cleaned: {filename} -> {output_file}")
# Example usage: convert everything in "html_files" into "cleaned_txt".
if __name__ == "__main__":
    # Folder containing HTML files
    folder = "html_files"
    # Folder to save cleaned text files
    output_folder = "cleaned_txt"

    clean_html_files_in_folder(
        folder_path=folder,
        output_folder=output_folder,
    )
This script:
- Loads HTML files from a folder.
- Strips out scripts, styles, and comments.
- Extracts and cleans readable text.
- Saves cleaned text to a new folder as `.txt` files.
You can change `folder` and `output_folder` to your directories.
Leave a Reply