Here’s a simple Python script that converts basic HTML to Markdown. It handles common tags like headings, paragraphs, bold, italic, links, images, lists, and code. You can expand it as needed.
from html.entities import html5
from html.parser import HTMLParser
class HTMLToMarkdown(HTMLParser):
    """Incremental converter from a basic subset of HTML to Markdown.

    Feed markup with ``feed()``, call ``close()`` to flush buffered text, then
    read the generated Markdown from the ``md`` attribute.

    Supported tags: ``h1``-``h6``, ``p``, ``strong``/``b``, ``em``/``i``,
    ``code``, ``pre``, ``a``, ``img``, ``ul``, ``ol`` and ``li``. Any other
    tag is dropped while its text content is kept.

    Attributes:
        md: Markdown produced so far.
        tag_stack: Currently open (non-void) tags, innermost last.
        list_level: Nesting depth of the list being processed (0 = none).
        list_type_stack: ``'ul'``/``'ol'`` for every open list, innermost last.
        ol_counters: Next item number for each open ordered list, by depth.
        current_link: ``href`` of the ``<a>`` element being processed.
    """

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Elements that never have an end tag; they must not be tracked as open,
    # otherwise they would stay on ``tag_stack`` forever.
    _VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    })

    # Four spaces nest correctly under both "- " and "1. " parent items.
    LIST_INDENT = '    '

    def __init__(self):
        super().__init__()
        self.md = ""
        self.tag_stack = []
        self.list_level = 0
        self.list_type_stack = []
        # Initialised up front so stray </a> or <li> tags cannot raise
        # AttributeError.
        self.ol_counters = {}
        self.current_link = ''

    def handle_starttag(self, tag, attrs):
        """Emit the Markdown that opens ``tag``.

        Args:
            tag: Lower-cased tag name.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None`` for
                attributes written without a value.
        """
        if tag not in self._VOID_TAGS:
            self.tag_stack.append(tag)

        if tag in self._HEADING_TAGS:
            self.md += '\n' + '#' * int(tag[1]) + ' '
        elif tag == 'p':
            self.md += '\n\n'
        elif tag in ('strong', 'b'):
            self.md += '**'
        elif tag in ('em', 'i'):
            self.md += '*'
        elif tag == 'code':
            # Inside <pre> the fence already marks the text as code; inline
            # backticks there would show up literally.
            if 'pre' not in self.tag_stack:
                self.md += '`'
        elif tag == 'pre':
            self.md += '\n```\n'
        elif tag == 'a':
            # ``or ''`` also covers a valueless attribute (value is None).
            self.current_link = dict(attrs).get('href') or ''
            self.md += '['
        elif tag == 'img':
            attr_map = dict(attrs)
            alt = attr_map.get('alt') or ''
            src = attr_map.get('src') or ''
            self.md += f'![{alt}]({src})'
        elif tag == 'ul':
            self.list_type_stack.append('ul')
            self.list_level += 1
        elif tag == 'ol':
            self.list_type_stack.append('ol')
            self.list_level += 1
            self.ol_counters[self.list_level] = 1
        elif tag == 'li':
            depth = max(self.list_level - 1, 0)
            self.md += '\n' + self.LIST_INDENT * depth
            if self.list_type_stack and self.list_type_stack[-1] == 'ol':
                number = self.ol_counters.get(self.list_level, 1)
                self.md += f'{number}. '
                self.ol_counters[self.list_level] = number + 1
            else:
                # Bullet for <ul> items and for <li> found outside any list.
                self.md += '- '

    def handle_endtag(self, tag):
        """Emit the Markdown that closes ``tag``.

        End tags without a matching open tag are ignored, so malformed input
        cannot unbalance the output or underflow the internal stacks.
        """
        if tag not in self.tag_stack:
            return

        # Drop every tag opened after the matching start tag: those elements
        # are implicitly closed (for example <li> without </li>).
        while self.tag_stack:
            open_tag = self.tag_stack.pop()
            if open_tag == tag:
                break
            if open_tag in ('ul', 'ol'):
                self._close_list()

        if tag in self._HEADING_TAGS:
            self.md += '\n'
        elif tag == 'p':
            self.md += '\n\n'
        elif tag in ('strong', 'b'):
            self.md += '**'
        elif tag in ('em', 'i'):
            self.md += '*'
        elif tag == 'code':
            if 'pre' not in self.tag_stack:
                self.md += '`'
        elif tag == 'pre':
            self.md += '\n```\n'
        elif tag == 'a':
            self.md += f']({self.current_link})'
            self.current_link = ''
        elif tag in ('ul', 'ol'):
            self._close_list()

    def _close_list(self):
        """Leave the innermost open list and update the list bookkeeping."""
        if self.list_type_stack:
            self.list_type_stack.pop()
        self.ol_counters.pop(self.list_level, None)
        self.list_level = max(self.list_level - 1, 0)
        if self.list_level == 0:
            # A blank line keeps following text out of the last list item.
            self.md += '\n\n'

    def handle_data(self, data):
        """Append text content unchanged."""
        self.md += data

    def handle_entityref(self, name):
        """Append the character for a named reference such as ``&amp;``.

        Only called when ``convert_charrefs`` is false; with the default
        setting the base parser decodes references before ``handle_data``.
        Unknown names are written back verbatim.
        """
        if name == 'nbsp':
            # Use a plain space rather than U+00A0.
            self.md += ' '
        else:
            self.md += html5.get(f'{name};', f'&{name};')

    def handle_charref(self, name):
        """Append the character for a numeric reference such as ``&#65;``.

        Only called when ``convert_charrefs`` is false. ``name`` is the text
        between ``&#`` and ``;``; a leading ``x`` or ``X`` marks hexadecimal.
        References that are not valid code points are written back verbatim.
        """
        try:
            if name[:1] in ('x', 'X'):
                code_point = int(name[1:], 16)
            else:
                code_point = int(name)
            char = chr(code_point)
        except (ValueError, OverflowError):
            self.md += f'&#{name};'
        else:
            self.md += char
def html_to_markdown(html):
    """Convert an HTML string to Markdown.

    Args:
        html: HTML markup to convert.

    Returns:
        The generated Markdown with leading and trailing whitespace removed.
    """
    parser = HTMLToMarkdown()
    parser.feed(html)
    # feed() may hold back trailing text (for example "AT&T", which could be
    # the start of a character reference); close() forces it to be processed
    # instead of being silently dropped.
    parser.close()
    return parser.md.strip()
# Example usage:
if __name__ == '__main__':
    # Small document exercising a heading, inline formatting, a link, a list,
    # a code block and an image.
    sample_html = '''
<h1>Title</h1>
<p>This is a <strong>bold</strong> paragraph with <a href="https://example.com">a link</a>.</p>
<ul>
<li>Item one</li>
<li>Item two</li>
</ul>
<pre><code>print("Hello World")</code></pre>
<img src="image.jpg" alt="An image">
'''
    print(html_to_markdown(sample_html))
This parser uses Python’s built-in HTMLParser. It converts common tags to their Markdown equivalents and handles nested lists, links, images, and code blocks. You can add more tag handlers if you need more HTML elements converted.
Share this page in your favorite way: click any app below to share.