The Palos Publishing Company

Follow Us On The X Platform @PalosPublishing
Categories We Write About

Make an HTML-to-Markdown converter

Here’s a simple Python script that converts basic HTML to Markdown. It handles common tags like headings, paragraphs, bold, italic, links, images, lists, and code. You can expand it as needed.

python
from html import unescape
from html.parser import HTMLParser


class HTMLToMarkdown(HTMLParser):
    """Convert a basic subset of HTML to Markdown while parsing.

    Supported elements: ``h1``-``h6``, ``p``, ``strong``/``b``, ``em``/``i``,
    ``code``, ``pre``, ``a``, ``img``, ``ul``, ``ol`` and ``li``. Any other
    tag is ignored and only its text content is kept.

    Feed markup with :meth:`feed`, call :meth:`close` to flush buffered
    text, then read the accumulated Markdown from :attr:`md`.

    Attributes:
        md: Markdown text produced so far.
        tag_stack: Names of the currently open (non-void) elements.
        list_level: Current list nesting depth (0 when outside any list).
        list_type_stack: ``'ul'``/``'ol'`` for each open list, outermost first.
        ol_counters: Next item number for each open ordered list, keyed by
            nesting depth.
        current_link: ``href`` of the innermost open ``<a>``, or ``''``.
    """

    # Elements without a closing tag; they must not be pushed on tag_stack
    # because no end tag will ever arrive to pop them.
    _VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    })
    _HEADING_TAGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})
    # Four spaces per level nests correctly under both "- " and "1. " markers.
    _LIST_INDENT = '    '
    _ENTITIES = {
        'nbsp': ' ',
        'lt': '<',
        'gt': '>',
        'amp': '&',
        'quot': '"',
        'apos': "'",
    }

    def __init__(self):
        super().__init__()
        self.md = ""
        self.tag_stack = []
        self.list_level = 0
        self.list_type_stack = []
        # Initialised here so stray </a> or <li> tags cannot raise
        # AttributeError before any <a>/<ol> has been seen.
        self.ol_counters = {}
        self.current_link = ''
        self._link_stack = []

    def handle_starttag(self, tag, attrs):
        """Emit the Markdown that opens *tag*.

        Args:
            tag: Lower-cased element name.
            attrs: ``(name, value)`` pairs; a value is ``None`` when the
                attribute was written without one.
        """
        inside_pre = 'pre' in self.tag_stack
        if tag not in self._VOID_TAGS:
            self.tag_stack.append(tag)

        if tag in self._HEADING_TAGS:
            self.md += '\n' + '#' * int(tag[1]) + ' '
        elif tag == 'p':
            self.md += '\n\n'
        elif tag in ('strong', 'b'):
            self.md += '**'
        elif tag in ('em', 'i'):
            self.md += '*'
        elif tag == 'code':
            # Inside a fenced block backticks would be printed literally.
            if not inside_pre:
                self.md += '`'
        elif tag == 'pre':
            self.md += '\n```\n'
        elif tag == 'a':
            self.current_link = dict(attrs).get('href') or ''
            self._link_stack.append(self.current_link)
            self.md += '['
        elif tag == 'img':
            attr_dict = dict(attrs)
            alt = attr_dict.get('alt') or ''
            src = attr_dict.get('src') or ''
            self.md += f'![{alt}]({src})'
        elif tag in ('ul', 'ol'):
            self.list_type_stack.append(tag)
            self.list_level += 1
            if tag == 'ol':
                self.ol_counters[self.list_level] = 1
        elif tag == 'li':
            # max() keeps an <li> outside any list at column 0.
            self.md += '\n' + self._LIST_INDENT * max(self.list_level - 1, 0)
            if self.list_type_stack and self.list_type_stack[-1] == 'ol':
                num = self.ol_counters.get(self.list_level, 1)
                self.md += f'{num}. '
                self.ol_counters[self.list_level] = num + 1
            else:
                # Unordered list, or an <li> with no enclosing list.
                self.md += '- '

    def handle_endtag(self, tag):
        """Emit the Markdown that closes *tag*; unmatched closers are tolerated."""
        if tag in self.tag_stack:
            # Unwind to the matching opener so unclosed elements
            # (e.g. <p> or <li> without an end tag) do not linger.
            while self.tag_stack.pop() != tag:
                pass

        if tag in ('strong', 'b'):
            self.md += '**'
        elif tag in ('em', 'i'):
            self.md += '*'
        elif tag == 'code':
            if 'pre' not in self.tag_stack:
                self.md += '`'
        elif tag == 'pre':
            self.md += '\n```\n'
        elif tag == 'a':
            # A </a> with no matching <a> produces no output.
            if self._link_stack:
                href = self._link_stack.pop()
                self.md += f']({href})'
            self.current_link = self._link_stack[-1] if self._link_stack else ''
        elif tag in ('ul', 'ol'):
            # Guarded: a stray </ul> or </ol> must not raise IndexError.
            if self.list_type_stack:
                self.list_type_stack.pop()
                self.ol_counters.pop(self.list_level, None)
                self.list_level -= 1

    def handle_data(self, data):
        """Append text content unchanged."""
        self.md += data

    def handle_entityref(self, name):
        """Append the character for the named entity ``&name;``.

        HTMLParser only calls this when ``convert_charrefs`` is false;
        otherwise references arrive already decoded via handle_data.
        Unknown names are emitted verbatim as ``&name;``.
        """
        if name in self._ENTITIES:
            self.md += self._ENTITIES[name]
        else:
            # unescape() returns its input unchanged for unknown entities.
            self.md += unescape(f'&{name};')

    def handle_charref(self, name):
        """Append the character for a numeric reference ``&#name;``.

        Accepts decimal (``65``) and hexadecimal (``x41``/``X41``) forms.
        References that cannot be decoded are emitted verbatim.
        """
        try:
            if name[:1] in ('x', 'X'):
                code_point = int(name[1:], 16)
            else:
                code_point = int(name)
            char = chr(code_point)
        except (ValueError, OverflowError):
            self.md += f'&#{name};'
        else:
            self.md += char


def html_to_markdown(html):
    """Return *html* converted to Markdown, stripped of outer whitespace.

    Args:
        html: HTML markup as a string.

    Returns:
        The Markdown produced by HTMLToMarkdown.
    """
    parser = HTMLToMarkdown()
    parser.feed(html)
    # close() flushes text the parser is still buffering, e.g. trailing
    # text containing an unterminated "&" such as "AT&T".
    parser.close()
    return parser.md.strip()


# Example usage:
if __name__ == '__main__':
    sample_html = '''
    <h1>Title</h1>
    <p>This is a <strong>bold</strong> paragraph with <a href="https://example.com">a link</a>.</p>
    <ul>
        <li>Item one</li>
        <li>Item two</li>
    </ul>
    <pre><code>print("Hello World")</code></pre>
    <img src="image.jpg" alt="An image">
    '''
    md = html_to_markdown(sample_html)
    print(md)

This parser uses Python’s built-in HTMLParser. It converts common tags to their Markdown equivalents and handles nested lists, links, images, and code blocks. You can add more tag handlers if you need more HTML elements converted.

Share this page in your favorite way: click any app below to share.

Enter your email below to join The Palos Publishing Company Email List

We respect your email privacy

Categories We Write About