From 8f96ef690b0e0a34a4cd3ec2a6ae25f9ef3a5bcb Mon Sep 17 00:00:00 2001
From: Yuhang Tao <35098797+TITC@users.noreply.github.com>
Date: Wed, 27 Apr 2022 18:16:34 +0800
Subject: [PATCH 1/8] Accelerate rendering and fix small problems

---
 pix2tex/dataset/latex2png.py |  81 ++++++++++++----
 pix2tex/dataset/render.py    | 178 +++++++++++++++++++++++------------
 2 files changed, 177 insertions(+), 82 deletions(-)

diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py
index 63060c4..222e269 100644
--- a/pix2tex/dataset/latex2png.py
+++ b/pix2tex/dataset/latex2png.py
@@ -6,7 +6,9 @@
 import io
 import glob
 import tempfile
+import shlex
 import subprocess
+import traceback
 from PIL import Image
 
 
@@ -27,6 +29,8 @@ def __init__(self, math, dpi=250, font='Latin Modern Math'):
         self.math = math
         self.dpi = dpi
         self.font = font
+        self.prefix_line = self.BASE.split("\n").index(
+            "%s")  # used for calculate error formula index
 
     def write(self, return_bytes=False):
         # inline = bool(re.match('^\$[^$]*\$$', self.math)) and False
@@ -39,8 +43,9 @@ def write(self, return_bytes=False):
                 # print(document)
                 f.write(document)
 
-            png = self.convert_file(texfile, workdir, return_bytes=return_bytes)
-            return png
+            png, error_index = self.convert_file(
+                texfile, workdir, return_bytes=return_bytes)
+            return png, error_index
 
         finally:
             if os.path.exists(texfile):
@@ -53,32 +58,40 @@ def convert_file(self, infile, workdir, return_bytes=False):
 
         try:
             # Generate the PDF file
-            cmd = 'xelatex -halt-on-error -output-directory %s %s' % (workdir, infile)
+            #  not stop on error line, but return error line index,index start from 1
+            cmd = 'xelatex -interaction nonstopmode -file-line-error -output-directory %s %s' % (
+                workdir, infile)
 
             p = subprocess.Popen(
-                cmd,
-                shell=True,
+                shlex.split(cmd),
                 stdin=subprocess.PIPE,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
+                universal_newlines=True
             )
             sout, serr = p.communicate()
-            # Something bad happened, abort
-            if p.returncode != 0:
-                raise Exception('latex error', serr, sout)
-
+            # extract error line from sout
+            error_index, _ = extract(text=sout, expression="%s:(\d+)" % infile)
+            # extract success rendered equation
+            if error_index != []:
+                # offset index start from 0, same as self.math
+                error_index = [int(_)-self.prefix_line-1 for _ in error_index]
             # Convert the PDF file to PNG's
             pdffile = infile.replace('.tex', '.pdf')
+            result, _ = extract(
+                text=sout, expression="Output written on %s \((.*)? pages\)" % pdffile)
+            if int(result[0]) != len(self.math):
+                raise Exception('xelatex rendering error, generated %d formula\'s page, but the total number of formulas is %d.' % (
+                    int(result[0]), len(self.math)))
             pngfile = os.path.join(workdir, infile.replace('.tex', '.png'))
 
-            cmd = 'magick convert -density %i -colorspace gray %s -quality 90 %s' % (
+            cmd = 'convert -density %i -colorspace gray %s -quality 90 %s' % (
                 self.dpi,
                 pdffile,
                 pngfile,
             )  # -bg Transparent -z 9
             p = subprocess.Popen(
-                cmd,
-                shell=True,
+                shlex.split(cmd),
                 stdin=subprocess.PIPE,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
@@ -86,18 +99,23 @@ def convert_file(self, infile, workdir, return_bytes=False):
 
             sout, serr = p.communicate()
             if p.returncode != 0:
-                raise Exception('PDFpng error', serr, cmd, os.path.exists(pdffile), os.path.exists(infile))
+                raise Exception('PDFpng error', serr, cmd, os.path.exists(
+                    pdffile), os.path.exists(infile))
             if return_bytes:
                 if len(self.math) > 1:
-                    png = [open(pngfile.replace('.png', '')+'-%i.png' % i, 'rb').read() for i in range(len(self.math))]
+                    png = [open(pngfile.replace('.png', '')+'-%i.png' %
+                                i, 'rb').read() for i in range(len(self.math))]
                 else:
-                    png = [open(pngfile.replace('.png', '')+'.png', 'rb').read()]
-                return png
+                    png = [open(pngfile.replace(
+                        '.png', '')+'.png', 'rb').read()]
             else:
+                # return path
                 if len(self.math) > 1:
-                    return [(pngfile.replace('.png', '')+'-%i.png' % i) for i in range(len(self.math))]
+                    png = [(pngfile.replace('.png', '')+'-%i.png' % i)
+                           for i in range(len(self.math))]
                 else:
-                    return (pngfile.replace('.png', '')+'.png')
+                    png = [(pngfile.replace('.png', '')+'.png')]
+            return png, error_index
         finally:
             # Cleanup temporaries
             basefile = infile.replace('.tex', '')
@@ -122,9 +140,32 @@ def tex2png(eq, **kwargs):
 
 
 def tex2pil(tex, **kwargs):
-    pngs = Latex(tex, **kwargs).write(return_bytes=True)
+    pngs, error_index = Latex(tex, **kwargs).write(return_bytes=True)
     images = [Image.open(io.BytesIO(d)) for d in pngs]
-    return images
+    return images, error_index
+
+
+def extract(text, expression=None, type: str = None):
+    """extract text from text by regular expression
+
+    Args:
+        text (str): input text
+        expression (str, optional): regular expression. Defaults to None.
+        type (str, optional): type of extracted text. Defaults to None.
+
+    Returns:
+        str: extracted text
+    """
+    if type is not None:
+        type2expression = {"en": r"[a-zA-Z]+", "zh": r"[\u4e00-\u9fa5]+", "num": r"\d+",
+                           "punctuation": u"[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]"}
+        expression = type2expression[type]
+    try:
+        pattern = re.compile(expression)
+        results = re.findall(pattern, text)
+        return results, True if len(results) != 0 else False
+    except Exception:
+        traceback.print_exc()
 
 
 if __name__ == '__main__':
diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py
index a1790bf..6ff69ae 100644
--- a/pix2tex/dataset/render.py
+++ b/pix2tex/dataset/render.py
@@ -1,4 +1,5 @@
-from pix2tex.dataset.latex2png import *
+
+from pix2tex.dataset.latex2png import Latex, tex2pil
 import argparse
 import sys
 import os
@@ -7,112 +8,165 @@
 from tqdm.auto import tqdm
 import cv2
 import numpy as np
+from PIL import Image
+import traceback
+import subprocess
+import shlex
 
 
-def render_dataset(dataset: np.ndarray, names: np.ndarray, args):
-    '''Renders a list of tex equations
+def get_installed_fonts(tex_path: str):
+    cmd = "find %s -name *Math*.otf" % tex_path
+    process = subprocess.Popen(shlex.split(cmd),
+                               stdout=subprocess.PIPE,
+                               stderr=subprocess.PIPE,
+                               universal_newlines=True
+                               )
+    stdout, stderr = process.communicate()
+    if process.returncode != 0:
+        raise Exception(stderr)
+    fonts = [_.split(os.sep)[-1] for _ in stdout.split('\n')][:-1]
+    fonts.append("Latin Modern Math")
+    return fonts
 
+
+def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args):
+    '''Renders a list of tex equations
     Args:
         dataset (numpy.ndarray): List of equations
-        names (numpy.ndarray): List of integers of size `dataset` that give the name of the saved image
+        unrenders (numpy.ndarray): List of integers of size `dataset` that give the name of the saved image
         args (Union[Namespace, Munch]): additional arguments: mode (equation or inline), out (output directory), divable (common factor )
                                         batchsize (how many samples to render at once), dpi, font (Math font), preprocess (crop, alpha off)
                                         shuffle (bool)
-
     Returns:
         list: equation indices that could not be rendered. 
     '''
-    assert len(names) == len(dataset), 'names and dataset must be of equal size'
+    assert len(unrenders) == len(
+        dataset), 'unrenders and dataset must be of equal size'
     math_mode = '$$'if args.mode == 'equation' else '$'
     os.makedirs(args.out, exist_ok=True)
-    indices = np.array([int(os.path.basename(img).split('.')[0]) for img in glob.glob(os.path.join(args.out, '*.png'))])
-
-    valid = [i for i, j in enumerate(names) if j not in indices]
+    # remove successfully rendered equations
+    rendered = np.array([int(os.path.basename(img).split('.')[0])
+                        for img in glob.glob(os.path.join(args.out, '*.png'))])
+    valid = [i for i, j in enumerate(unrenders) if j not in rendered]
+    # update unrenders and dataset
     dataset = dataset[valid]
-    names = names[valid]
-    order = np.random.permutation(len(dataset)) if args.shuffle else np.arange(len(dataset))
+    unrenders = unrenders[valid]
+    order = np.random.permutation(
+        len(dataset)) if args.shuffle else np.arange(len(dataset))
     faulty = []
-    for i in tqdm(range(0, len(dataset), args.batchsize)):
-        batch = dataset[order[i:i+args.batchsize]]
+    for batch_offset in tqdm(range(0, len(dataset), args.batchsize), desc="global batch index"):
+        batch = dataset[order[batch_offset:batch_offset+args.batchsize]]
         #batch = [x for j, x in enumerate(batch) if order[i+j] not in indices]
         if len(batch) == 0:
             continue
-        math = [math_mode+x+math_mode for x in batch if x != '']
+        valid_math = np.asarray([[i, "%s %s %s" % (math_mode, x, math_mode)] for i, x in enumerate(
+            batch) if x != ''], dtype=object)  # space used to prevent escape $
         #print('\n', i, len(math), '\n'.join(math))
-        if len(args.font) > 1:
-            font = np.random.choice(args.font)
-        else:
-            font = args.font[0]
-        if len(args.dpi) > 1:
-            dpi = np.random.choice(np.arange(min(args.dpi), max(args.dpi)))
-        else:
-            dpi = args.dpi[0]
-        if len(math) > 0:
+        font = font = np.random.choice(args.font) if len(
+            args.font) > 1 else args.font[0]
+        dpi = np.random.choice(np.arange(min(args.dpi), max(args.dpi))) if len(
+            args.dpi) > 1 else args.dpi[0]
+        if len(valid_math) > 0:
+            valid_idx, math = valid_math.T
+            valid_idx = valid_idx.astype(np.int32)
             try:
                 if args.preprocess:
-                    pngs = tex2pil(math, dpi=dpi, font=font)
+                    pngs, error_index = tex2pil(math, dpi=dpi, font=font)
                 else:
-                    pngs = Latex(math, dpi=dpi, font=font).write(return_bytes=False)
+                    pngs, error_index = Latex(math, dpi=dpi, font=font).write(
+                        return_bytes=False)
+                # error_index not count "" line, use valid_idx transfer to real index matching in batch index
+                local_error_index = valid_idx[error_index]
+                # tranfer in batch index to global batch index
+                global_error_index = [
+                    batch_offset+_ for _ in local_error_index]
+                faulty.extend(list(unrenders[order[global_error_index]]))
             except Exception as e:
-                #print(e)
-                #print(math)
-                #raise e
-                faulty.extend(list(names[order[i:i+args.batchsize]]))
+                print("\n%s" % e, end='')
+                faulty.extend(
+                    list(unrenders[order[batch_offset:batch_offset+args.batchsize]]))
                 continue
 
-            for j, k in enumerate(range(i, i+len(pngs))):
-                outpath = os.path.join(args.out, '%07d.png' % names[order[k]])
+            for inbatch_idx, order_idx in enumerate(range(batch_offset, batch_offset+args.batchsize)):
+                # exclude render failed equations and blank line
+                if inbatch_idx in local_error_index or inbatch_idx not in valid_idx:
+                    continue
+                outpath = os.path.join(args.out, '%07d.png' %
+                                       unrenders[order[order_idx]])
+                png_idx = np.where(valid_idx == inbatch_idx)[0][0]
                 if args.preprocess:
                     try:
-                        data = np.asarray(pngs[j])
+                        data = np.asarray(pngs[png_idx])
                         # print(data.shape)
-                        gray = 255*(data[..., 0] < 128).astype(np.uint8)  # To invert the text to white
-                        coords = cv2.findNonZero(gray)  # Find all non-zero points (text)
-                        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
+                        # To invert the text to white
+                        gray = 255*(data[..., 0] < 128).astype(np.uint8)
+                        white_pixels = np.sum(gray == 255)
+                        # some png will be whole white, because some equation's syntax is wrong
+                        # eg.$$ \mathit { \Iota \Kappa \Lambda \Mu \Nu \Xi \Omicron \Pi } $$
+                        # extract from wikipedia english dump file https://dumps.wikimedia.org/enwiki/latest/
+                        white_percentage = (
+                            white_pixels / (gray.shape[0] * gray.shape[1]))
+                        if white_percentage == 0:
+                            continue
+                        # Find all non-zero points (text)
+                        coords = cv2.findNonZero(gray)
+                        # Find minimum spanning bounding box
+                        a, b, w, h = cv2.boundingRect(coords)
                         rect = data[b:b+h, a:a+w]
-                        im = Image.fromarray((255-rect[..., -1]).astype(np.uint8)).convert('L')
+                        im = Image.fromarray(
+                            (255-rect[..., -1]).astype(np.uint8)).convert('L')
                         dims = []
                         for x in [w, h]:
                             div, mod = divmod(x, args.divable)
-                            dims.append(args.divable*(div + (1 if mod > 0 else 0)))
+                            dims.append(
+                                args.divable*(div + (1 if mod > 0 else 0)))
                         padded = Image.new('L', dims, 255)
-                        padded.paste(im, im.getbbox())
+                        padded.paste(im, (0, 0, im.size[0], im.size[1]))
                         padded.save(outpath)
                     except Exception as e:
                         print(e)
                         pass
                 else:
-                    shutil.move(pngs[j], outpath)
-
+                    shutil.move(pngs[png_idx], outpath)
+    # prevent repeat between two error_index and imagemagic error
+    faulty = list(set(faulty))
+    faulty.sort()
     return np.array(faulty)
 
 
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='Render dataset')
-    parser.add_argument('-i', '--data', type=str, required=True, help='file of list of latex code')
-    parser.add_argument('-o', '--out', type=str, required=True, help='output directory')
-    parser.add_argument('-b', '--batchsize', type=int, default=100, help='How many equations to render at once')
-    parser.add_argument('-f', '--font', nargs='+', type=str, default=['Latin Modern Math', 'GFSNeohellenicMath.otf', 'Asana Math', 'XITS Math',
-                                                                      'Cambria Math', 'Latin Modern Math', 'Latin Modern Math', 'Latin Modern Math'], help='font to use. default = Latin Modern Math')
-    parser.add_argument('-m', '--mode', choices=['inline', 'equation'], default='equation', help='render as inline or equation')
-    parser.add_argument('--dpi', type=int, default=[110, 170], nargs='+', help='dpi range to render in')
-    parser.add_argument('-p', '--no-preprocess', dest='preprocess', default=True, action='store_false', help='crop, remove alpha channel, padding')
-    parser.add_argument('-d', '--divable', type=int, default=32, help='To what factor to pad the images')
-    parser.add_argument('-s', '--shuffle', action='store_true', help='Whether to shuffle the equations in the first iteration')
+    parser.add_argument('-i', '--data', type=str,
+                        required=True, help='file of list of latex code')
+    parser.add_argument('-o', '--out', type=str,
+                        required=True, help='output directory')
+    parser.add_argument('-b', '--batchsize', type=int, default=100,
+                        help='How many equations to render at once')
+    parser.add_argument('-f', '--font', nargs='+', type=str,
+                        default="", help='font to use.')
+    parser.add_argument('-fp', '--fonts_path', type=str,
+                        default="/usr/local/texlive/", help='installed font path')
+    parser.add_argument('-m', '--mode', choices=[
+                        'inline', 'equation'], default='equation', help='render as inline or equation')
+    parser.add_argument(
+        '--dpi', type=int, default=[110, 170], nargs='+', help='dpi range to render in')
+    parser.add_argument('-p', '--no-preprocess', dest='preprocess', default=True,
+                        action='store_false', help='crop, remove alpha channel, padding')
+    parser.add_argument('-d', '--divable', type=int, default=32,
+                        help='To what factor to pad the images')
+    parser.add_argument('-s', '--shuffle', action='store_true',
+                        help='Whether to shuffle the equations in the first iteration')
     args = parser.parse_args(sys.argv[1:])
-
+    args.font = args.font if args.font != "" else get_installed_fonts(
+        args.fonts_path)
+    print(args.font)
     dataset = np.array(open(args.data, 'r').read().split('\n'), dtype=object)
-    names = np.arange(len(dataset))
-    prev_names = None
-    for i in range(12):
-        if len(names) == 0:
-            break
-        prev_names = names
-        names = render_dataset(dataset[names], names, args)
-        same = names == prev_names
-        if (type(same) == bool and same) or (type(same) == np.ndarray and same.all()) or (args.batchsize == 1):
-            break
-        if len(names) < 50*args.batchsize:
+    unrenders = np.arange(len(dataset))
+    failed = np.array([])
+    while unrenders.tolist() != failed.tolist():
+        failed = unrenders
+        unrenders = render_dataset(dataset[unrenders], unrenders, args)
+        if len(unrenders) < 50*args.batchsize:
             args.batchsize = max([1, args.batchsize//2])
         args.shuffle = True

From 1a9a922091217e17942b9835909367c241283e79 Mon Sep 17 00:00:00 2001
From: Yuhang Tao <35098797+TITC@users.noreply.github.com>
Date: Thu, 28 Apr 2022 20:56:45 +0800
Subject: [PATCH 2/8] delete useless code

```python
    if type is not None:
        type2expression = {"en": r"[a-zA-Z]+", "zh": r"[\u4e00-\u9fa5]+", "num": r"\d+",
                           "punctuation": u"[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]"}
        expression = type2expression[type]
```
---
 pix2tex/dataset/latex2png.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py
index 222e269..03a6b6a 100644
--- a/pix2tex/dataset/latex2png.py
+++ b/pix2tex/dataset/latex2png.py
@@ -145,21 +145,16 @@ def tex2pil(tex, **kwargs):
     return images, error_index
 
 
-def extract(text, expression=None, type: str = None):
+def extract(text, expression=None):
     """extract text from text by regular expression
 
     Args:
         text (str): input text
         expression (str, optional): regular expression. Defaults to None.
-        type (str, optional): type of extracted text. Defaults to None.
 
     Returns:
         str: extracted text
     """
-    if type is not None:
-        type2expression = {"en": r"[a-zA-Z]+", "zh": r"[\u4e00-\u9fa5]+", "num": r"\d+",
-                           "punctuation": u"[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]"}
-        expression = type2expression[type]
     try:
         pattern = re.compile(expression)
         results = re.findall(pattern, text)

From b40a2934d899389a05bc48c597071ddec9275f9c Mon Sep 17 00:00:00 2001
From: TITC <yuhang.tao.email@gmail.com>
Date: Thu, 28 Apr 2022 21:47:49 +0800
Subject: [PATCH 3/8] Fix problem that broke the CLI

---
 pix2tex/dataset/latex2png.py | 2 +-
 pix2tex/dataset/render.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py
index 03a6b6a..4c1f866 100644
--- a/pix2tex/dataset/latex2png.py
+++ b/pix2tex/dataset/latex2png.py
@@ -142,7 +142,7 @@ def tex2png(eq, **kwargs):
 def tex2pil(tex, **kwargs):
     pngs, error_index = Latex(tex, **kwargs).write(return_bytes=True)
     images = [Image.open(io.BytesIO(d)) for d in pngs]
-    return images, error_index
+    return images, error_index if kwargs.get("error_index", False) else images
 
 
 def extract(text, expression=None):
diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py
index 6ff69ae..3b7fad2 100644
--- a/pix2tex/dataset/render.py
+++ b/pix2tex/dataset/render.py
@@ -71,7 +71,7 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args):
             valid_idx = valid_idx.astype(np.int32)
             try:
                 if args.preprocess:
-                    pngs, error_index = tex2pil(math, dpi=dpi, font=font)
+                    pngs, error_index = tex2pil(math, dpi=dpi, font=font, error_index=True)
                 else:
                     pngs, error_index = Latex(math, dpi=dpi, font=font).write(
                         return_bytes=False)

From 0776fd643c834cb73847564d2aa88783cd38a0bf Mon Sep 17 00:00:00 2001
From: TITC <yuhang.tao.email@gmail.com>
Date: Thu, 28 Apr 2022 21:56:23 +0800
Subject: [PATCH 4/8] Increase the weight of LM fonts

---
 pix2tex/dataset/render.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py
index 3b7fad2..27bbac9 100644
--- a/pix2tex/dataset/render.py
+++ b/pix2tex/dataset/render.py
@@ -25,7 +25,7 @@ def get_installed_fonts(tex_path: str):
     if process.returncode != 0:
         raise Exception(stderr)
     fonts = [_.split(os.sep)[-1] for _ in stdout.split('\n')][:-1]
-    fonts.append("Latin Modern Math")
+    fonts.extend(["Latin Modern Math"]*len(fonts))
     return fonts
 
 
@@ -71,7 +71,8 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args):
             valid_idx = valid_idx.astype(np.int32)
             try:
                 if args.preprocess:
-                    pngs, error_index = tex2pil(math, dpi=dpi, font=font, error_index=True)
+                    pngs, error_index = tex2pil(
+                        math, dpi=dpi, font=font, error_index=True)
                 else:
                     pngs, error_index = Latex(math, dpi=dpi, font=font).write(
                         return_bytes=False)

From c469cec890367b89dac70d4c8b842aa37a0de404 Mon Sep 17 00:00:00 2001
From: Lukas Blecher <luk.blecher@gmail.com>
Date: Thu, 28 Apr 2022 16:06:15 +0200
Subject: [PATCH 5/8] `unrenders` to `unrendered`

Add Windows support:
- use shell in get_installed_fonts
- resolve backslash and `convert` to `magick convert`
---
 pix2tex/dataset/latex2png.py |  8 ++++--
 pix2tex/dataset/render.py    | 54 +++++++++++++++---------------------
 2 files changed, 28 insertions(+), 34 deletions(-)

diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py
index 222e269..96acb37 100644
--- a/pix2tex/dataset/latex2png.py
+++ b/pix2tex/dataset/latex2png.py
@@ -55,12 +55,12 @@ def write(self, return_bytes=False):
                     pass
 
     def convert_file(self, infile, workdir, return_bytes=False):
-
+        infile = infile.replace('\\', '/')
         try:
             # Generate the PDF file
             #  not stop on error line, but return error line index,index start from 1
             cmd = 'xelatex -interaction nonstopmode -file-line-error -output-directory %s %s' % (
-                workdir, infile)
+                workdir.replace('\\', '/'), infile)
 
             p = subprocess.Popen(
                 shlex.split(cmd),
@@ -71,7 +71,7 @@ def convert_file(self, infile, workdir, return_bytes=False):
             )
             sout, serr = p.communicate()
             # extract error line from sout
-            error_index, _ = extract(text=sout, expression="%s:(\d+)" % infile)
+            error_index, _ = extract(text=sout, expression=r"%s:(\d+)" % os.path.basename(infile))
             # extract success rendered equation
             if error_index != []:
                 # offset index start from 0, same as self.math
@@ -90,6 +90,8 @@ def convert_file(self, infile, workdir, return_bytes=False):
                 pdffile,
                 pngfile,
             )  # -bg Transparent -z 9
+            if sys.platform == 'win32':
+                cmd = 'magick ' + cmd
             p = subprocess.Popen(
                 shlex.split(cmd),
                 stdin=subprocess.PIPE,
diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py
index 6ff69ae..9a34d37 100644
--- a/pix2tex/dataset/render.py
+++ b/pix2tex/dataset/render.py
@@ -9,17 +9,16 @@
 import cv2
 import numpy as np
 from PIL import Image
-import traceback
 import subprocess
-import shlex
 
 
 def get_installed_fonts(tex_path: str):
     cmd = "find %s -name *Math*.otf" % tex_path
-    process = subprocess.Popen(shlex.split(cmd),
+    process = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
-                               universal_newlines=True
+                               universal_newlines=True,
+                               shell=True
                                )
     stdout, stderr = process.communicate()
     if process.returncode != 0:
@@ -29,30 +28,28 @@ def get_installed_fonts(tex_path: str):
     return fonts
 
 
-def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args):
+def render_dataset(dataset: np.ndarray, unrendered: np.ndarray, args):
     '''Renders a list of tex equations
     Args:
         dataset (numpy.ndarray): List of equations
-        unrenders (numpy.ndarray): List of integers of size `dataset` that give the name of the saved image
+        unrendered (numpy.ndarray): List of integers of size `dataset` that give the name of the saved image
         args (Union[Namespace, Munch]): additional arguments: mode (equation or inline), out (output directory), divable (common factor )
                                         batchsize (how many samples to render at once), dpi, font (Math font), preprocess (crop, alpha off)
                                         shuffle (bool)
     Returns:
         list: equation indices that could not be rendered. 
     '''
-    assert len(unrenders) == len(
-        dataset), 'unrenders and dataset must be of equal size'
+    assert len(unrendered) == len(dataset), 'unrendered and dataset must be of equal size'
     math_mode = '$$'if args.mode == 'equation' else '$'
     os.makedirs(args.out, exist_ok=True)
     # remove successfully rendered equations
     rendered = np.array([int(os.path.basename(img).split('.')[0])
-                        for img in glob.glob(os.path.join(args.out, '*.png'))])
-    valid = [i for i, j in enumerate(unrenders) if j not in rendered]
-    # update unrenders and dataset
+                         for img in glob.glob(os.path.join(args.out, '*.png'))])
+    valid = [i for i, j in enumerate(unrendered) if j not in rendered]
+    # update unrendered and dataset
     dataset = dataset[valid]
-    unrenders = unrenders[valid]
-    order = np.random.permutation(
-        len(dataset)) if args.shuffle else np.arange(len(dataset))
+    unrendered = unrendered[valid]
+    order = np.random.permutation(len(dataset)) if args.shuffle else np.arange(len(dataset))
     faulty = []
     for batch_offset in tqdm(range(0, len(dataset), args.batchsize), desc="global batch index"):
         batch = dataset[order[batch_offset:batch_offset+args.batchsize]]
@@ -80,19 +77,18 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args):
                 # tranfer in batch index to global batch index
                 global_error_index = [
                     batch_offset+_ for _ in local_error_index]
-                faulty.extend(list(unrenders[order[global_error_index]]))
+                faulty.extend(list(unrendered[order[global_error_index]]))
             except Exception as e:
                 print("\n%s" % e, end='')
                 faulty.extend(
-                    list(unrenders[order[batch_offset:batch_offset+args.batchsize]]))
+                    list(unrendered[order[batch_offset:batch_offset+args.batchsize]]))
                 continue
 
             for inbatch_idx, order_idx in enumerate(range(batch_offset, batch_offset+args.batchsize)):
                 # exclude render failed equations and blank line
                 if inbatch_idx in local_error_index or inbatch_idx not in valid_idx:
                     continue
-                outpath = os.path.join(args.out, '%07d.png' %
-                                       unrenders[order[order_idx]])
+                outpath = os.path.join(args.out, '%07d.png' % unrendered[order[order_idx]])
                 png_idx = np.where(valid_idx == inbatch_idx)[0][0]
                 if args.preprocess:
                     try:
@@ -104,8 +100,7 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args):
                         # some png will be whole white, because some equation's syntax is wrong
                         # eg.$$ \mathit { \Iota \Kappa \Lambda \Mu \Nu \Xi \Omicron \Pi } $$
                         # extract from wikipedia english dump file https://dumps.wikimedia.org/enwiki/latest/
-                        white_percentage = (
-                            white_pixels / (gray.shape[0] * gray.shape[1]))
+                        white_percentage = (white_pixels / (gray.shape[0] * gray.shape[1]))
                         if white_percentage == 0:
                             continue
                         # Find all non-zero points (text)
@@ -113,13 +108,11 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args):
                         # Find minimum spanning bounding box
                         a, b, w, h = cv2.boundingRect(coords)
                         rect = data[b:b+h, a:a+w]
-                        im = Image.fromarray(
-                            (255-rect[..., -1]).astype(np.uint8)).convert('L')
+                        im = Image.fromarray((255-rect[..., -1]).astype(np.uint8)).convert('L')
                         dims = []
                         for x in [w, h]:
                             div, mod = divmod(x, args.divable)
-                            dims.append(
-                                args.divable*(div + (1 if mod > 0 else 0)))
+                            dims.append(args.divable*(div + (1 if mod > 0 else 0)))
                         padded = Image.new('L', dims, 255)
                         padded.paste(im, (0, 0, im.size[0], im.size[1]))
                         padded.save(outpath)
@@ -149,8 +142,7 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args):
                         default="/usr/local/texlive/", help='installed font path')
     parser.add_argument('-m', '--mode', choices=[
                         'inline', 'equation'], default='equation', help='render as inline or equation')
-    parser.add_argument(
-        '--dpi', type=int, default=[110, 170], nargs='+', help='dpi range to render in')
+    parser.add_argument('--dpi', type=int, default=[110, 170], nargs='+', help='dpi range to render in')
     parser.add_argument('-p', '--no-preprocess', dest='preprocess', default=True,
                         action='store_false', help='crop, remove alpha channel, padding')
     parser.add_argument('-d', '--divable', type=int, default=32,
@@ -162,11 +154,11 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args):
         args.fonts_path)
     print(args.font)
     dataset = np.array(open(args.data, 'r').read().split('\n'), dtype=object)
-    unrenders = np.arange(len(dataset))
+    unrendered = np.arange(len(dataset))
     failed = np.array([])
-    while unrenders.tolist() != failed.tolist():
-        failed = unrenders
-        unrenders = render_dataset(dataset[unrenders], unrenders, args)
-        if len(unrenders) < 50*args.batchsize:
+    while unrendered.tolist() != failed.tolist():
+        failed = unrendered
+        unrendered = render_dataset(dataset[unrendered], unrendered, args)
+        if len(unrendered) < 50*args.batchsize:
             args.batchsize = max([1, args.batchsize//2])
         args.shuffle = True

From 0f9764a361934871a0337458da7cf42ab58f1e72 Mon Sep 17 00:00:00 2001
From: Lukas Blecher <luk.blecher@gmail.com>
Date: Thu, 28 Apr 2022 16:26:41 +0200
Subject: [PATCH 6/8] add brackets, use arguments instead of model args

Support single-page output: the page-count regex now matches xelatex's
singular "(1 page" log line as well as "(N pages".
---
 pix2tex/cli.py               | 6 +++---
 pix2tex/dataset/latex2png.py | 8 +++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pix2tex/cli.py b/pix2tex/cli.py
index e43e76a..7ae121f 100644
--- a/pix2tex/cli.py
+++ b/pix2tex/cli.py
@@ -173,8 +173,8 @@ def main():
                     ''')
                 continue
             elif ins in ['show', 'katex', 'no_resize']:
-                setattr(model.args, ins, not getattr(model.args, ins, False))
-                print('set %s to %s' % (ins, getattr(model.args, ins)))
+                setattr(arguments, ins, not getattr(arguments, ins, False))
+                print('set %s to %s' % (ins, getattr(arguments, ins)))
                 continue
             elif os.path.isfile(os.path.realpath(possible_file)):
                 file = possible_file
@@ -195,7 +195,7 @@ def main():
                     except:
                         pass
                 pred = model(img)
-                output_prediction(pred, model.args)
+                output_prediction(pred, arguments)
             except KeyboardInterrupt:
                 pass
             file = None
diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py
index 28b7bf5..0fd7fb5 100644
--- a/pix2tex/dataset/latex2png.py
+++ b/pix2tex/dataset/latex2png.py
@@ -79,7 +79,7 @@ def convert_file(self, infile, workdir, return_bytes=False):
             # Convert the PDF file to PNG's
             pdffile = infile.replace('.tex', '.pdf')
             result, _ = extract(
-                text=sout, expression="Output written on %s \((.*)? pages\)" % pdffile)
+                text=sout, expression="Output written on %s \((\d+)? page" % pdffile)
             if int(result[0]) != len(self.math):
                 raise Exception('xelatex rendering error, generated %d formula\'s page, but the total number of formulas is %d.' % (
                     int(result[0]), len(self.math)))
@@ -118,6 +118,8 @@ def convert_file(self, infile, workdir, return_bytes=False):
                 else:
                     png = [(pngfile.replace('.png', '')+'.png')]
             return png, error_index
+        except Exception as e:
+            print(e)
         finally:
             # Cleanup temporaries
             basefile = infile.replace('.tex', '')
@@ -141,10 +143,10 @@ def tex2png(eq, **kwargs):
     return __cache[eq]
 
 
-def tex2pil(tex, **kwargs):
+def tex2pil(tex, error_index=False, **kwargs):
     pngs, error_index = Latex(tex, **kwargs).write(return_bytes=True)
     images = [Image.open(io.BytesIO(d)) for d in pngs]
-    return images, error_index if kwargs.get("error_index", False) else images
+    return (images, error_index) if error_index else images
 
 
 def extract(text, expression=None):

From e0d76ae065bc12c3163177263555b2474c583742 Mon Sep 17 00:00:00 2001
From: Lukas Blecher <luk.blecher@gmail.com>
Date: Thu, 28 Apr 2022 16:29:04 +0200
Subject: [PATCH 7/8] Update latex2png.py

---
 pix2tex/dataset/latex2png.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py
index 0fd7fb5..7af27f4 100644
--- a/pix2tex/dataset/latex2png.py
+++ b/pix2tex/dataset/latex2png.py
@@ -143,10 +143,10 @@ def tex2png(eq, **kwargs):
     return __cache[eq]
 
 
-def tex2pil(tex, error_index=False, **kwargs):
+def tex2pil(tex, return_error_index=False, **kwargs):
     pngs, error_index = Latex(tex, **kwargs).write(return_bytes=True)
     images = [Image.open(io.BytesIO(d)) for d in pngs]
-    return (images, error_index) if error_index else images
+    return (images, error_index) if return_error_index else images
 
 
 def extract(text, expression=None):

From 4f7ba9fc5ea171ed470f2a0444c51cc14780a48c Mon Sep 17 00:00:00 2001
From: Lukas Blecher <luk.blecher@gmail.com>
Date: Thu, 28 Apr 2022 16:30:00 +0200
Subject: [PATCH 8/8] Update render.py

---
 pix2tex/dataset/render.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py
index 9f0d921..e395bae 100644
--- a/pix2tex/dataset/render.py
+++ b/pix2tex/dataset/render.py
@@ -69,7 +69,7 @@ def render_dataset(dataset: np.ndarray, unrendered: np.ndarray, args):
             try:
                 if args.preprocess:
                     pngs, error_index = tex2pil(
-                        math, dpi=dpi, font=font, error_index=True)
+                        math, dpi=dpi, font=font, return_error_index=True)
                 else:
                     pngs, error_index = Latex(math, dpi=dpi, font=font).write(
                         return_bytes=False)