Make an HTML-to-Markdown converter

Here’s a simple Python script that converts basic HTML to Markdown. It handles common tags like headings, paragraphs, bold, italic, links, images, lists, and code. You can expand it as needed.

python
from html.parser import HTMLParser

class HTMLToMarkdown(HTMLParser):
    """Translate a basic subset of HTML into Markdown while parsing.

    Feed markup with ``feed()`` (and finish with ``close()``); the Markdown
    produced so far accumulates in the ``md`` attribute.

    Converted tags: h1-h6, p, strong/b, em/i, code, pre, a, img, ul, ol and
    li. Any other tag is dropped and only its text content is kept.
    Malformed input (stray closing tags, list items outside a list,
    attributes without a value) is tolerated rather than raising.
    """

    _HEADINGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})

    # Named entities decoded by handle_entityref().
    _ENTITIES = {
        'nbsp': ' ',
        'lt': '<',
        'gt': '>',
        'amp': '&',
        'quot': '"',
        'apos': "'",
    }

    def __init__(self):
        super().__init__()
        self.md = ""
        # Bookkeeping only: nothing in this class reads the stack when
        # converting.
        self.tag_stack = []
        self.list_level = 0
        self.list_type_stack = []
        # Initialised here so that malformed input (e.g. a closing </a>
        # with no opening tag) cannot hit a missing attribute.
        self.current_link = ''
        self.ol_counters = {}
        self._link_open = False
        self._pre_depth = 0

    def handle_starttag(self, tag, attrs):
        """Emit the Markdown that opens *tag*.

        Args:
            tag: Lower-cased tag name supplied by HTMLParser.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None``
                for an attribute written without a value.
        """
        self.tag_stack.append(tag)

        if tag in self._HEADINGS:
            level = int(tag[1])
            self.md += '\n' + '#' * level + ' '
        elif tag == 'p':
            self.md += '\n\n'
        elif tag in ('strong', 'b'):
            self.md += '**'
        elif tag in ('em', 'i'):
            self.md += '*'
        elif tag == 'code':
            # Inside <pre> the fence already marks the text as code; inline
            # backticks there would show up literally in the output.
            if not self._pre_depth:
                self.md += '`'
        elif tag == 'pre':
            self._pre_depth += 1
            self.md += '\n```\n'
        elif tag == 'a':
            # "or ''" covers a valueless attribute (<a href>), whose value
            # is None and would otherwise be rendered as the text "None".
            self.current_link = dict(attrs).get('href') or ''
            self._link_open = True
            self.md += '['
        elif tag == 'img':
            attr_dict = dict(attrs)
            alt = attr_dict.get('alt') or ''
            src = attr_dict.get('src') or ''
            self.md += f'![{alt}]({src})'
        elif tag == 'ul':
            self.list_type_stack.append('ul')
            self.list_level += 1
        elif tag == 'ol':
            self.list_type_stack.append('ol')
            self.list_level += 1
            # Numbering restarts every time an ordered list is opened.
            self.ol_counters[self.list_level] = 1
        elif tag == 'li':
            # Two spaces of indent per nesting level below the first.
            self.md += '\n' + '  ' * max(self.list_level - 1, 0)
            if self.list_type_stack and self.list_type_stack[-1] == 'ol':
                num = self.ol_counters.get(self.list_level, 1)
                self.md += f'{num}. '
                self.ol_counters[self.list_level] = num + 1
            else:
                # <ul> items, and items that appear outside any list.
                self.md += '- '

    def handle_endtag(self, tag):
        """Emit the Markdown that closes *tag* and unwind list state.

        Args:
            tag: Lower-cased tag name supplied by HTMLParser.
        """
        if self.tag_stack:
            self.tag_stack.pop()

        if tag in ('strong', 'b'):
            self.md += '**'
        elif tag in ('em', 'i'):
            self.md += '*'
        elif tag == 'code':
            if not self._pre_depth:
                self.md += '`'
        elif tag == 'pre':
            if self._pre_depth:
                self._pre_depth -= 1
            self.md += '\n```\n'
        elif tag == 'a':
            # Only close a link that was actually opened.
            if self._link_open:
                self.md += f']({self.current_link})'
                self._link_open = False
            self.current_link = ''
        elif tag in ('ul', 'ol'):
            # A stray closing tag with no open list is ignored.
            if self.list_type_stack:
                self.list_type_stack.pop()
                self.list_level -= 1
            if tag == 'ol' and self.list_level == 0:
                self.ol_counters = {}

    def handle_data(self, data):
        """Append text content to the output unchanged."""
        self.md += data

    def handle_entityref(self, name):
        """Append the character for the named entity ``&name;``.

        Unknown names are written back verbatim as ``&name;``.

        NOTE: per the html.parser documentation this hook is only invoked
        when the parser is created with ``convert_charrefs=False``; with the
        default used by ``__init__`` the parser decodes references itself
        and passes the result to ``handle_data``.
        """
        self.md += self._ENTITIES.get(name, f'&{name};')

    def handle_charref(self, name):
        """Append the character for the numeric reference ``&#name;``.

        Args:
            name: Decimal digits, or ``x``/``X`` followed by hex digits.

        A reference that cannot be decoded is written back verbatim. As
        with ``handle_entityref``, the html.parser documentation states this
        hook is only invoked when ``convert_charrefs`` is false.
        """
        try:
            if name.lower().startswith('x'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            self.md += chr(codepoint)
        except (ValueError, OverflowError):
            # int() rejects malformed digits; chr() rejects code points
            # outside the Unicode range.
            self.md += f'&#{name};'

def html_to_markdown(html):
    """Convert an HTML string to Markdown.

    Args:
        html: The HTML markup to convert.

    Returns:
        The Markdown text with leading and trailing whitespace stripped.
    """
    parser = HTMLToMarkdown()
    parser.feed(html)
    # feed() may hold back trailing text (for instance text ending in a bare
    # "&", which could be the start of a character reference); close()
    # forces that buffered data through the handlers so it is not lost.
    parser.close()
    return parser.md.strip()

# Demo: run this file directly to convert a small HTML sample and print it.
if __name__ == '__main__':
    # Sample covering a heading, inline formatting, a link, a list,
    # a code block and an image.
    sample_html = '''
    <h1>Title</h1>
    <p>This is a <strong>bold</strong> paragraph with <a href="https://example.com">a link</a>.</p>
    <ul>
        <li>Item one</li>
        <li>Item two</li>
    </ul>
    <pre><code>print("Hello World")</code></pre>
    <img src="image.jpg" alt="An image">
    '''
    print(html_to_markdown(sample_html))

This parser uses Python’s built-in HTMLParser. It converts common tags to their Markdown equivalents and handles nested lists, links, images, and code blocks. You can add more tag handlers if you need more HTML elements converted.

Share this page your favorite way: click any app below to share.

See all the ways to share this page

Check out the newest posts we wrote:

Why your ML system design must support partial retraining

Why your ML pipeline must detect missing or stale features

Why your ML feedback loop must consider label quality

Why your ML deployment plan must include fallback logic