Writing your own Markdown to LaTEX parser
What we want
Disclaimers
Overview of Steps
Let’s Make It!
Libraries
import markdown
import argparse as ap
from pathlib import Path
import re
from html.parser import HTMLParser
from html.entities import name2codepoint
Base Templates
default_template = """
\\documentclass[12pt]{article}
\\usepackage[a4paper, total={6in, 8in}]{geometry}
\\usepackage[utf8]{inputenc}
\\usepackage[T1]{fontenc}
\\usepackage[english]{babel}
\\usepackage{graphicx}
\\usepackage[dvipsnames]{xcolor}
\\usepackage{hyperref}
\\usepackage{listings}
\\newcommand\myshade{85}
\\colorlet{mylinkcolor}{violet}
\\colorlet{mycitecolor}{YellowOrange}
\\colorlet{myurlcolor}{Aquamarine}
\\hypersetup{
linkcolor = mylinkcolor!\\myshade!black,
citecolor = mycitecolor!\\myshade!black,
urlcolor = myurlcolor!\\myshade!black,
colorlinks = true,
}
\\author{}
"""
HTML Parser
class MyHTMLParser(HTMLParser):
def __init__(self):
super().__init__()
self.attrs = []
def handle_starttag(self, tag, attrs):
for attr in attrs:
self.attrs.append(attr)
def get_attrs(self):
return self.attrs
def handle_endtag(self, tag):
pass
def handle_data(self, data):
print("Data :", data)
def handle_comment(self, data):
print("Comment :", data)
def handle_entityref(self, name):
c = chr(name2codepoint[name])
print("Named ent:", c)
def handle_charref(self, name):
if name.startswith('x'):
c = chr(int(name[1:], 16))
else:
c = chr(int(name))
print("Num ent :", c)
def handle_decl(self, data):
print("Decl :", data)
def get_html_attributes(text):
parser = MyHTMLParser()
parser.feed(text)
return parser.get_attrs()
Replace strings
replacer_dict = {
"<head>" : "",
"</head>" : "",
"<html>" : "",
"</html>" : "",
"<p>" : "",
"</p>" : "",
"<h1>" : "\\begin{document}\n\\toc: true
title{",
"</h1>" : "}\n\\maketoc: true
title{}\n",
"<h2>" : "\\section{",
"<h3>" : "\\subsection{",
"<h4>" : "\\subsubsection{",
# "<body>" : "\\begin{document}\n",
"</body>" : "\\end{document}\n",
"<ul>" : "\\begin{itemize}\n",
"</ul>" : "\\end{itemize}\n",
"<il>" : "\\begin{enumerate}\n",
"</il>" : "\\end{enumerate}\n",
"<code>" : "\\begin{lstlisting}[language=Python]\n",
"</code>" : "\\end{lstlisting}\n",
"<li>" : "\\item ",
"</li>" : "",
"%": "\%",
"&": "\&",
}
Close Tags
def add_end_brace(list_of_vals, replacer_dict):
list_of_vals = [x.strip() for x in list_of_vals.split(",")]
for i in list_of_vals:
replacer_dict[i.replace("<", "</")] = "}\n"
add_end_brace(
list_of_vals="<h2>, <h3>, <h4>",
replacer_dict=replacer_dict
)
Images
def figure_code(text):
found_links = re.findall('\<img .* \/>' , text)
for link in found_links:
attrs = get_html_attributes(link)
caption_data = ""
file_path = ""
for i in attrs:
if i[0] == "alt":
caption_data = i[1]
if i[0] == "src":
file_path = i[1]
gen_latex = "\\begin{figure}[!htbp]\n\centering\n\includegraphics[width=.75\columnwidth]{"+file_path+"}\n\caption{"+caption_data+"}\n\label{}\n\end{figure}"
text = text.replace(link, gen_latex)
return text
CLI input
ags = ap.ArgumentParser("md2tex")
ags.add_argument("-f", help="Full file path", required=True)
ags.add_argument("-d", help="Insert default formatting code", action='store_true')
aps = ags.parse_args()
f_name = Path(aps.f)
Running the pipeline
# Read the file
with open(f_name, 'r') as f:
text = f.read()
html = markdown.markdown(text)
# Replacing things
text = figure_code(html)
for key in replacer_dict.keys():
text = text.replace(key, replacer_dict[key])
# Write the file
with open(f_name.parent/f"{f_name.stem}.tex", 'w') as f:
if aps.d:
f.write(default_template)
f.write(text)
if aps.d:
f.write("\\end{document}")
Fin
This article is in the hopes that it will help someone out. Maybe have the help that I did not. I do not know who it will reach. But to whoever it does, best of luck :)
Like these/Want more? Buy me a coffee! Kofi
Want articles on something specific? Just ask!
You can always contact me on LinkedIn, or drop me an [[mailto:msubhaditya@gmail.com|Email]]. For all the code, drop by my Github.