From 8f96ef690b0e0a34a4cd3ec2a6ae25f9ef3a5bcb Mon Sep 17 00:00:00 2001 From: Yuhang Tao <35098797+TITC@users.noreply.github.com> Date: Wed, 27 Apr 2022 18:16:34 +0800 Subject: [PATCH 1/8] render accelerate&small problem fix --- pix2tex/dataset/latex2png.py | 81 ++++++++++++---- pix2tex/dataset/render.py | 178 +++++++++++++++++++++++------------ 2 files changed, 177 insertions(+), 82 deletions(-) diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py index 63060c4..222e269 100644 --- a/pix2tex/dataset/latex2png.py +++ b/pix2tex/dataset/latex2png.py @@ -6,7 +6,9 @@ import io import glob import tempfile +import shlex import subprocess +import traceback from PIL import Image @@ -27,6 +29,8 @@ def __init__(self, math, dpi=250, font='Latin Modern Math'): self.math = math self.dpi = dpi self.font = font + self.prefix_line = self.BASE.split("\n").index( + "%s") # used for calculate error formula index def write(self, return_bytes=False): # inline = bool(re.match('^\$[^$]*\$$', self.math)) and False @@ -39,8 +43,9 @@ def write(self, return_bytes=False): # print(document) f.write(document) - png = self.convert_file(texfile, workdir, return_bytes=return_bytes) - return png + png, error_index = self.convert_file( + texfile, workdir, return_bytes=return_bytes) + return png, error_index finally: if os.path.exists(texfile): @@ -53,32 +58,40 @@ def convert_file(self, infile, workdir, return_bytes=False): try: # Generate the PDF file - cmd = 'xelatex -halt-on-error -output-directory %s %s' % (workdir, infile) + # not stop on error line, but return error line index,index start from 1 + cmd = 'xelatex -interaction nonstopmode -file-line-error -output-directory %s %s' % ( + workdir, infile) p = subprocess.Popen( - cmd, - shell=True, + shlex.split(cmd), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + universal_newlines=True ) sout, serr = p.communicate() - # Something bad happened, abort - if p.returncode != 0: - raise Exception('latex error', serr, sout) - + # extract error line from sout + error_index, _ = extract(text=sout, expression="%s:(\d+)" % infile) + # extract success rendered equation + if error_index != []: + # offset index start from 0, same as self.math + error_index = [int(_)-self.prefix_line-1 for _ in error_index] # Convert the PDF file to PNG's pdffile = infile.replace('.tex', '.pdf') + result, _ = extract( + text=sout, expression="Output written on %s \((.*)? pages\)" % pdffile) + if int(result[0]) != len(self.math): + raise Exception('xelatex rendering error, generated %d formula\'s page, but the total number of formulas is %d.' % ( + int(result[0]), len(self.math))) pngfile = os.path.join(workdir, infile.replace('.tex', '.png')) - cmd = 'magick convert -density %i -colorspace gray %s -quality 90 %s' % ( + cmd = 'convert -density %i -colorspace gray %s -quality 90 %s' % ( self.dpi, pdffile, pngfile, ) # -bg Transparent -z 9 p = subprocess.Popen( - cmd, - shell=True, + shlex.split(cmd), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -86,18 +99,23 @@ def convert_file(self, infile, workdir, return_bytes=False): sout, serr = p.communicate() if p.returncode != 0: - raise Exception('PDFpng error', serr, cmd, os.path.exists(pdffile), os.path.exists(infile)) + raise Exception('PDFpng error', serr, cmd, os.path.exists( + pdffile), os.path.exists(infile)) if return_bytes: if len(self.math) > 1: - png = [open(pngfile.replace('.png', '')+'-%i.png' % i, 'rb').read() for i in range(len(self.math))] + png = [open(pngfile.replace('.png', '')+'-%i.png' % + i, 'rb').read() for i in range(len(self.math))] else: - png = [open(pngfile.replace('.png', '')+'.png', 'rb').read()] - return png + png = [open(pngfile.replace( + '.png', '')+'.png', 'rb').read()] else: + # return path if len(self.math) > 1: - return [(pngfile.replace('.png', '')+'-%i.png' % i) for i in range(len(self.math))] + png = [(pngfile.replace('.png', '')+'-%i.png' % i) + for i in range(len(self.math))] else: - return (pngfile.replace('.png', '')+'.png') + png = [(pngfile.replace('.png', '')+'.png')] + return png, error_index finally: # Cleanup temporaries basefile = infile.replace('.tex', '') @@ -122,9 +140,32 @@ def tex2png(eq, **kwargs): def tex2pil(tex, **kwargs): - pngs = Latex(tex, **kwargs).write(return_bytes=True) + pngs, error_index = Latex(tex, **kwargs).write(return_bytes=True) images = [Image.open(io.BytesIO(d)) for d in pngs] - return images + return images, error_index + + +def extract(text, expression=None, type: str = None): + """extract text from text by regular expression + + Args: + text (str): input text + expression (str, optional): regular expression. Defaults to None. + type (str, optional): type of extracted text. Defaults to None. + + Returns: + str: extracted text + """ + if type is not None: + type2expression = {"en": r"[a-zA-Z]+", "zh": r"[\u4e00-\u9fa5]+", "num": r"\d+", + "punctuation": u"[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]"} + expression = type2expression[type] + try: + pattern = re.compile(expression) + results = re.findall(pattern, text) + return results, True if len(results) != 0 else False + except Exception: + traceback.print_exc() if __name__ == '__main__': diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py index a1790bf..6ff69ae 100644 --- a/pix2tex/dataset/render.py +++ b/pix2tex/dataset/render.py @@ -1,4 +1,5 @@ -from pix2tex.dataset.latex2png import * + +from pix2tex.dataset.latex2png import Latex, tex2pil import argparse import sys import os @@ -7,112 +8,165 @@ from tqdm.auto import tqdm import cv2 import numpy as np +from PIL import Image +import traceback +import subprocess +import shlex -def render_dataset(dataset: np.ndarray, names: np.ndarray, args): - '''Renders a list of tex equations +def get_installed_fonts(tex_path: str): + cmd = "find %s -name *Math*.otf" % tex_path + process = subprocess.Popen(shlex.split(cmd), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True + ) + stdout, stderr = process.communicate() + if process.returncode != 0: + raise Exception(stderr) + fonts = [_.split(os.sep)[-1] for _ in stdout.split('\n')][:-1] + fonts.append("Latin Modern Math") + return fonts + +def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args): + '''Renders a list of tex equations Args: dataset (numpy.ndarray): List of equations - names (numpy.ndarray): List of integers of size `dataset` that give the name of the saved image + unrenders (numpy.ndarray): List of integers of size `dataset` that give the name of the saved image args (Union[Namespace, Munch]): additional arguments: mode (equation or inline), out (output directory), divable (common factor ) batchsize (how many samples to render at once), dpi, font (Math font), preprocess (crop, alpha off) shuffle (bool) - Returns: list: equation indices that could not be rendered. ''' - assert len(names) == len(dataset), 'names and dataset must be of equal size' + assert len(unrenders) == len( + dataset), 'unrenders and dataset must be of equal size' math_mode = '$$'if args.mode == 'equation' else '$' os.makedirs(args.out, exist_ok=True) - indices = np.array([int(os.path.basename(img).split('.')[0]) for img in glob.glob(os.path.join(args.out, '*.png'))]) - - valid = [i for i, j in enumerate(names) if j not in indices] + # remove successfully rendered equations + rendered = np.array([int(os.path.basename(img).split('.')[0]) + for img in glob.glob(os.path.join(args.out, '*.png'))]) + valid = [i for i, j in enumerate(unrenders) if j not in rendered] + # update unrenders and dataset dataset = dataset[valid] - names = names[valid] - order = np.random.permutation(len(dataset)) if args.shuffle else np.arange(len(dataset)) + unrenders = unrenders[valid] + order = np.random.permutation( + len(dataset)) if args.shuffle else np.arange(len(dataset)) faulty = [] - for i in tqdm(range(0, len(dataset), args.batchsize)): - batch = dataset[order[i:i+args.batchsize]] + for batch_offset in tqdm(range(0, len(dataset), args.batchsize), desc="global batch index"): + batch = dataset[order[batch_offset:batch_offset+args.batchsize]] #batch = [x for j, x in enumerate(batch) if order[i+j] not in indices] if len(batch) == 0: continue - math = [math_mode+x+math_mode for x in batch if x != ''] + valid_math = np.asarray([[i, "%s %s %s" % (math_mode, x, math_mode)] for i, x in enumerate( + batch) if x != ''], dtype=object) # space used to prevent escape $ #print('\n', i, len(math), '\n'.join(math)) - if len(args.font) > 1: - font = np.random.choice(args.font) - else: - font = args.font[0] - if len(args.dpi) > 1: - dpi = np.random.choice(np.arange(min(args.dpi), max(args.dpi))) - else: - dpi = args.dpi[0] - if len(math) > 0: + font = font = np.random.choice(args.font) if len( + args.font) > 1 else args.font[0] + dpi = np.random.choice(np.arange(min(args.dpi), max(args.dpi))) if len( + args.dpi) > 1 else args.dpi[0] + if len(valid_math) > 0: + valid_idx, math = valid_math.T + valid_idx = valid_idx.astype(np.int32) try: if args.preprocess: - pngs = tex2pil(math, dpi=dpi, font=font) + pngs, error_index = tex2pil(math, dpi=dpi, font=font) else: - pngs = Latex(math, dpi=dpi, font=font).write(return_bytes=False) + pngs, error_index = Latex(math, dpi=dpi, font=font).write( + return_bytes=False) + # error_index not count "" line, use valid_idx transfer to real index matching in batch index + local_error_index = valid_idx[error_index] + # tranfer in batch index to global batch index + global_error_index = [ + batch_offset+_ for _ in local_error_index] + faulty.extend(list(unrenders[order[global_error_index]])) except Exception as e: - #print(e) - #print(math) - #raise e - faulty.extend(list(names[order[i:i+args.batchsize]])) + print("\n%s" % e, end='') + faulty.extend( + list(unrenders[order[batch_offset:batch_offset+args.batchsize]])) continue - for j, k in enumerate(range(i, i+len(pngs))): - outpath = os.path.join(args.out, '%07d.png' % names[order[k]]) + for inbatch_idx, order_idx in enumerate(range(batch_offset, batch_offset+args.batchsize)): + # exclude render failed equations and blank line + if inbatch_idx in local_error_index or inbatch_idx not in valid_idx: + continue + outpath = os.path.join(args.out, '%07d.png' % + unrenders[order[order_idx]]) + png_idx = np.where(valid_idx == inbatch_idx)[0][0] if args.preprocess: try: - data = np.asarray(pngs[j]) + data = np.asarray(pngs[png_idx]) # print(data.shape) - gray = 255*(data[..., 0] < 128).astype(np.uint8) # To invert the text to white - coords = cv2.findNonZero(gray) # Find all non-zero points (text) - a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box + # To invert the text to white + gray = 255*(data[..., 0] < 128).astype(np.uint8) + white_pixels = np.sum(gray == 255) + # some png will be whole white, because some equation's syntax is wrong + # eg.$$ \mathit { \Iota \Kappa \Lambda \Mu \Nu \Xi \Omicron \Pi } $$ + # extract from wikipedia english dump file https://dumps.wikimedia.org/enwiki/latest/ + white_percentage = ( + white_pixels / (gray.shape[0] * gray.shape[1])) + if white_percentage == 0: + continue + # Find all non-zero points (text) + coords = cv2.findNonZero(gray) + # Find minimum spanning bounding box + a, b, w, h = cv2.boundingRect(coords) rect = data[b:b+h, a:a+w] - im = Image.fromarray((255-rect[..., -1]).astype(np.uint8)).convert('L') + im = Image.fromarray( + (255-rect[..., -1]).astype(np.uint8)).convert('L') dims = [] for x in [w, h]: div, mod = divmod(x, args.divable) - dims.append(args.divable*(div + (1 if mod > 0 else 0))) + dims.append( + args.divable*(div + (1 if mod > 0 else 0))) padded = Image.new('L', dims, 255) - padded.paste(im, im.getbbox()) + padded.paste(im, (0, 0, im.size[0], im.size[1])) padded.save(outpath) except Exception as e: print(e) pass else: - shutil.move(pngs[j], outpath) - + shutil.move(pngs[png_idx], outpath) + # prevent repeat between two error_index and imagemagic error + faulty = list(set(faulty)) + faulty.sort() return np.array(faulty) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Render dataset') - parser.add_argument('-i', '--data', type=str, required=True, help='file of list of latex code') - parser.add_argument('-o', '--out', type=str, required=True, help='output directory') - parser.add_argument('-b', '--batchsize', type=int, default=100, help='How many equations to render at once') - parser.add_argument('-f', '--font', nargs='+', type=str, default=['Latin Modern Math', 'GFSNeohellenicMath.otf', 'Asana Math', 'XITS Math', - 'Cambria Math', 'Latin Modern Math', 'Latin Modern Math', 'Latin Modern Math'], help='font to use. default = Latin Modern Math') - parser.add_argument('-m', '--mode', choices=['inline', 'equation'], default='equation', help='render as inline or equation') - parser.add_argument('--dpi', type=int, default=[110, 170], nargs='+', help='dpi range to render in') - parser.add_argument('-p', '--no-preprocess', dest='preprocess', default=True, action='store_false', help='crop, remove alpha channel, padding') - parser.add_argument('-d', '--divable', type=int, default=32, help='To what factor to pad the images') - parser.add_argument('-s', '--shuffle', action='store_true', help='Whether to shuffle the equations in the first iteration') + parser.add_argument('-i', '--data', type=str, + required=True, help='file of list of latex code') + parser.add_argument('-o', '--out', type=str, + required=True, help='output directory') + parser.add_argument('-b', '--batchsize', type=int, default=100, + help='How many equations to render at once') + parser.add_argument('-f', '--font', nargs='+', type=str, + default="", help='font to use.') + parser.add_argument('-fp', '--fonts_path', type=str, + default="/usr/local/texlive/", help='installed font path') + parser.add_argument('-m', '--mode', choices=[ + 'inline', 'equation'], default='equation', help='render as inline or equation') + parser.add_argument( + '--dpi', type=int, default=[110, 170], nargs='+', help='dpi range to render in') + parser.add_argument('-p', '--no-preprocess', dest='preprocess', default=True, + action='store_false', help='crop, remove alpha channel, padding') + parser.add_argument('-d', '--divable', type=int, default=32, + help='To what factor to pad the images') + parser.add_argument('-s', '--shuffle', action='store_true', + help='Whether to shuffle the equations in the first iteration') args = parser.parse_args(sys.argv[1:]) - + args.font = args.font if args.font != "" else get_installed_fonts( + args.fonts_path) + print(args.font) dataset = np.array(open(args.data, 'r').read().split('\n'), dtype=object) - names = np.arange(len(dataset)) - prev_names = None - for i in range(12): - if len(names) == 0: - break - prev_names = names - names = render_dataset(dataset[names], names, args) - same = names == prev_names - if (type(same) == bool and same) or (type(same) == np.ndarray and same.all()) or (args.batchsize == 1): - break - if len(names) < 50*args.batchsize: + unrenders = np.arange(len(dataset)) + failed = np.array([]) + while unrenders.tolist() != failed.tolist(): + failed = unrenders + unrenders = render_dataset(dataset[unrenders], unrenders, args) + if len(unrenders) < 50*args.batchsize: args.batchsize = max([1, args.batchsize//2]) args.shuffle = True From 1a9a922091217e17942b9835909367c241283e79 Mon Sep 17 00:00:00 2001 From: Yuhang Tao <35098797+TITC@users.noreply.github.com> Date: Thu, 28 Apr 2022 20:56:45 +0800 Subject: [PATCH 2/8] delete useless code ```python if type is not None: type2expression = {"en": r"[a-zA-Z]+", "zh": r"[\u4e00-\u9fa5]+", "num": r"\d+", "punctuation": u"[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]"} expression = type2expression[type] ``` --- pix2tex/dataset/latex2png.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py index 222e269..03a6b6a 100644 --- a/pix2tex/dataset/latex2png.py +++ b/pix2tex/dataset/latex2png.py @@ -145,21 +145,16 @@ def tex2pil(tex, **kwargs): return images, error_index -def extract(text, expression=None, type: str = None): +def extract(text, expression=None): """extract text from text by regular expression Args: text (str): input text expression (str, optional): regular expression. Defaults to None. - type (str, optional): type of extracted text. Defaults to None. Returns: str: extracted text """ - if type is not None: - type2expression = {"en": r"[a-zA-Z]+", "zh": r"[\u4e00-\u9fa5]+", "num": r"\d+", - "punctuation": u"[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]"} - expression = type2expression[type] try: pattern = re.compile(expression) results = re.findall(pattern, text) From b40a2934d899389a05bc48c597071ddec9275f9c Mon Sep 17 00:00:00 2001 From: TITC Date: Thu, 28 Apr 2022 21:47:49 +0800 Subject: [PATCH 3/8] fix break cli problem --- pix2tex/dataset/latex2png.py | 2 +- pix2tex/dataset/render.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py index 03a6b6a..4c1f866 100644 --- a/pix2tex/dataset/latex2png.py +++ b/pix2tex/dataset/latex2png.py @@ -142,7 +142,7 @@ def tex2png(eq, **kwargs): def tex2pil(tex, **kwargs): pngs, error_index = Latex(tex, **kwargs).write(return_bytes=True) images = [Image.open(io.BytesIO(d)) for d in pngs] - return images, error_index + return images, error_index if kwargs.get("error_index", False) else images def extract(text, expression=None): diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py index 6ff69ae..3b7fad2 100644 --- a/pix2tex/dataset/render.py +++ b/pix2tex/dataset/render.py @@ -71,7 +71,7 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args): valid_idx = valid_idx.astype(np.int32) try: if args.preprocess: - pngs, error_index = tex2pil(math, dpi=dpi, font=font) + pngs, error_index = tex2pil(math, dpi=dpi, font=font, error_index=True) else: pngs, error_index = Latex(math, dpi=dpi, font=font).write( return_bytes=False) From 0776fd643c834cb73847564d2aa88783cd38a0bf Mon Sep 17 00:00:00 2001 From: TITC Date: Thu, 28 Apr 2022 21:56:23 +0800 Subject: [PATCH 4/8] Increase the weight of LM fonts --- pix2tex/dataset/render.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py index 3b7fad2..27bbac9 100644 --- a/pix2tex/dataset/render.py +++ b/pix2tex/dataset/render.py @@ -25,7 +25,7 @@ def get_installed_fonts(tex_path: str): if process.returncode != 0: raise Exception(stderr) fonts = [_.split(os.sep)[-1] for _ in stdout.split('\n')][:-1] - fonts.append("Latin Modern Math") + fonts.extend(["Latin Modern Math"]*len(fonts)) return fonts @@ -71,7 +71,8 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args): valid_idx = valid_idx.astype(np.int32) try: if args.preprocess: - pngs, error_index = tex2pil(math, dpi=dpi, font=font, error_index=True) + pngs, error_index = tex2pil( + math, dpi=dpi, font=font, error_index=True) else: pngs, error_index = Latex(math, dpi=dpi, font=font).write( return_bytes=False) From c469cec890367b89dac70d4c8b842aa37a0de404 Mon Sep 17 00:00:00 2001 From: Lukas Blecher Date: Thu, 28 Apr 2022 16:06:15 +0200 Subject: [PATCH 5/8] `unerenders` to `unerendered` Add windows support: - use shell in get_installed_fonts - resolve backslash and `convert` to `magick convert` --- pix2tex/dataset/latex2png.py | 8 ++++-- pix2tex/dataset/render.py | 54 +++++++++++++++--------------------- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py index 222e269..96acb37 100644 --- a/pix2tex/dataset/latex2png.py +++ b/pix2tex/dataset/latex2png.py @@ -55,12 +55,12 @@ def write(self, return_bytes=False): pass def convert_file(self, infile, workdir, return_bytes=False): - + infile = infile.replace('\\', '/') try: # Generate the PDF file # not stop on error line, but return error line index,index start from 1 cmd = 'xelatex -interaction nonstopmode -file-line-error -output-directory %s %s' % ( - workdir, infile) + workdir.replace('\\', '/'), infile) p = subprocess.Popen( shlex.split(cmd), @@ -71,7 +71,7 @@ def convert_file(self, infile, workdir, return_bytes=False): ) sout, serr = p.communicate() # extract error line from sout - error_index, _ = extract(text=sout, expression="%s:(\d+)" % infile) + error_index, _ = extract(text=sout, expression=r"%s:(\d+)" % os.path.basename(infile)) # extract success rendered equation if error_index != []: # offset index start from 0, same as self.math @@ -90,6 +90,8 @@ def convert_file(self, infile, workdir, return_bytes=False): pdffile, pngfile, ) # -bg Transparent -z 9 + if sys.platform == 'win32': + cmd = 'magick ' + cmd p = subprocess.Popen( shlex.split(cmd), stdin=subprocess.PIPE, diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py index 6ff69ae..9a34d37 100644 --- a/pix2tex/dataset/render.py +++ b/pix2tex/dataset/render.py @@ -9,17 +9,16 @@ import cv2 import numpy as np from PIL import Image -import traceback import subprocess -import shlex def get_installed_fonts(tex_path: str): cmd = "find %s -name *Math*.otf" % tex_path - process = subprocess.Popen(shlex.split(cmd), + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - universal_newlines=True + universal_newlines=True, + shell=True ) stdout, stderr = process.communicate() if process.returncode != 0: @@ -29,30 +28,28 @@ def get_installed_fonts(tex_path: str): return fonts -def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args): +def render_dataset(dataset: np.ndarray, unrendered: np.ndarray, args): '''Renders a list of tex equations Args: dataset (numpy.ndarray): List of equations - unrenders (numpy.ndarray): List of integers of size `dataset` that give the name of the saved image + unrendered (numpy.ndarray): List of integers of size `dataset` that give the name of the saved image args (Union[Namespace, Munch]): additional arguments: mode (equation or inline), out (output directory), divable (common factor ) batchsize (how many samples to render at once), dpi, font (Math font), preprocess (crop, alpha off) shuffle (bool) Returns: list: equation indices that could not be rendered. ''' - assert len(unrenders) == len( - dataset), 'unrenders and dataset must be of equal size' + assert len(unrendered) == len(dataset), 'unrendered and dataset must be of equal size' math_mode = '$$'if args.mode == 'equation' else '$' os.makedirs(args.out, exist_ok=True) # remove successfully rendered equations rendered = np.array([int(os.path.basename(img).split('.')[0]) - for img in glob.glob(os.path.join(args.out, '*.png'))]) - valid = [i for i, j in enumerate(unrenders) if j not in rendered] - # update unrenders and dataset + for img in glob.glob(os.path.join(args.out, '*.png'))]) + valid = [i for i, j in enumerate(unrendered) if j not in rendered] + # update unrendered and dataset dataset = dataset[valid] - unrenders = unrenders[valid] - order = np.random.permutation( - len(dataset)) if args.shuffle else np.arange(len(dataset)) + unrendered = unrendered[valid] + order = np.random.permutation(len(dataset)) if args.shuffle else np.arange(len(dataset)) faulty = [] for batch_offset in tqdm(range(0, len(dataset), args.batchsize), desc="global batch index"): batch = dataset[order[batch_offset:batch_offset+args.batchsize]] @@ -80,19 +77,18 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args): # tranfer in batch index to global batch index global_error_index = [ batch_offset+_ for _ in local_error_index] - faulty.extend(list(unrenders[order[global_error_index]])) + faulty.extend(list(unrendered[order[global_error_index]])) except Exception as e: print("\n%s" % e, end='') faulty.extend( - list(unrenders[order[batch_offset:batch_offset+args.batchsize]])) + list(unrendered[order[batch_offset:batch_offset+args.batchsize]])) continue for inbatch_idx, order_idx in enumerate(range(batch_offset, batch_offset+args.batchsize)): # exclude render failed equations and blank line if inbatch_idx in local_error_index or inbatch_idx not in valid_idx: continue - outpath = os.path.join(args.out, '%07d.png' % - unrenders[order[order_idx]]) + outpath = os.path.join(args.out, '%07d.png' % unrendered[order[order_idx]]) png_idx = np.where(valid_idx == inbatch_idx)[0][0] if args.preprocess: try: @@ -104,8 +100,7 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args): # some png will be whole white, because some equation's syntax is wrong # eg.$$ \mathit { \Iota \Kappa \Lambda \Mu \Nu \Xi \Omicron \Pi } $$ # extract from wikipedia english dump file https://dumps.wikimedia.org/enwiki/latest/ - white_percentage = ( - white_pixels / (gray.shape[0] * gray.shape[1])) + white_percentage = (white_pixels / (gray.shape[0] * gray.shape[1])) if white_percentage == 0: continue # Find all non-zero points (text) @@ -113,13 +108,11 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args): # Find minimum spanning bounding box a, b, w, h = cv2.boundingRect(coords) rect = data[b:b+h, a:a+w] - im = Image.fromarray( - (255-rect[..., -1]).astype(np.uint8)).convert('L') + im = Image.fromarray((255-rect[..., -1]).astype(np.uint8)).convert('L') dims = [] for x in [w, h]: div, mod = divmod(x, args.divable) - dims.append( - args.divable*(div + (1 if mod > 0 else 0))) + dims.append(args.divable*(div + (1 if mod > 0 else 0))) padded = Image.new('L', dims, 255) padded.paste(im, (0, 0, im.size[0], im.size[1])) padded.save(outpath) @@ -149,8 +142,7 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args): default="/usr/local/texlive/", help='installed font path') parser.add_argument('-m', '--mode', choices=[ 'inline', 'equation'], default='equation', help='render as inline or equation') - parser.add_argument( - '--dpi', type=int, default=[110, 170], nargs='+', help='dpi range to render in') + parser.add_argument('--dpi', type=int, default=[110, 170], nargs='+', help='dpi range to render in') parser.add_argument('-p', '--no-preprocess', dest='preprocess', default=True, action='store_false', help='crop, remove alpha channel, padding') parser.add_argument('-d', '--divable', type=int, default=32, @@ -162,11 +154,11 @@ def render_dataset(dataset: np.ndarray, unrenders: np.ndarray, args): args.fonts_path) print(args.font) dataset = np.array(open(args.data, 'r').read().split('\n'), dtype=object) - unrenders = np.arange(len(dataset)) + unrendered = np.arange(len(dataset)) failed = np.array([]) - while unrenders.tolist() != failed.tolist(): - failed = unrenders - unrenders = render_dataset(dataset[unrenders], unrenders, args) - if len(unrenders) < 50*args.batchsize: + while unrendered.tolist() != failed.tolist(): + failed = unrendered + unrendered = render_dataset(dataset[unrendered], unrendered, args) + if len(unrendered) < 50*args.batchsize: args.batchsize = max([1, args.batchsize//2]) args.shuffle = True From 0f9764a361934871a0337458da7cf42ab58f1e72 Mon Sep 17 00:00:00 2001 From: Lukas Blecher Date: Thu, 28 Apr 2022 16:26:41 +0200 Subject: [PATCH 6/8] add brackets, use arguments instead of model args support single page output. --- pix2tex/cli.py | 6 +++--- pix2tex/dataset/latex2png.py | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pix2tex/cli.py b/pix2tex/cli.py index e43e76a..7ae121f 100644 --- a/pix2tex/cli.py +++ b/pix2tex/cli.py @@ -173,8 +173,8 @@ def main(): ''') continue elif ins in ['show', 'katex', 'no_resize']: - setattr(model.args, ins, not getattr(model.args, ins, False)) - print('set %s to %s' % (ins, getattr(model.args, ins))) + setattr(arguments, ins, not getattr(arguments, ins, False)) + print('set %s to %s' % (ins, getattr(arguments, ins))) continue elif os.path.isfile(os.path.realpath(possible_file)): file = possible_file @@ -195,7 +195,7 @@ def main(): except: pass pred = model(img) - output_prediction(pred, model.args) + output_prediction(pred, arguments) except KeyboardInterrupt: pass file = None diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py index 28b7bf5..0fd7fb5 100644 --- a/pix2tex/dataset/latex2png.py +++ b/pix2tex/dataset/latex2png.py @@ -79,7 +79,7 @@ def convert_file(self, infile, workdir, return_bytes=False): # Convert the PDF file to PNG's pdffile = infile.replace('.tex', '.pdf') result, _ = extract( - text=sout, expression="Output written on %s \((.*)? pages\)" % pdffile) + text=sout, expression="Output written on %s \((\d+)? page" % pdffile) if int(result[0]) != len(self.math): raise Exception('xelatex rendering error, generated %d formula\'s page, but the total number of formulas is %d.' % ( int(result[0]), len(self.math))) @@ -118,6 +118,8 @@ def convert_file(self, infile, workdir, return_bytes=False): else: png = [(pngfile.replace('.png', '')+'.png')] return png, error_index + except Exception as e: + print(e) finally: # Cleanup temporaries basefile = infile.replace('.tex', '') @@ -141,10 +143,10 @@ def tex2png(eq, **kwargs): return __cache[eq] -def tex2pil(tex, **kwargs): +def tex2pil(tex, error_index=False, **kwargs): pngs, error_index = Latex(tex, **kwargs).write(return_bytes=True) images = [Image.open(io.BytesIO(d)) for d in pngs] - return images, error_index if kwargs.get("error_index", False) else images + return (images, error_index) if error_index else images def extract(text, expression=None): From e0d76ae065bc12c3163177263555b2474c583742 Mon Sep 17 00:00:00 2001 From: Lukas Blecher Date: Thu, 28 Apr 2022 16:29:04 +0200 Subject: [PATCH 7/8] Update latex2png.py --- pix2tex/dataset/latex2png.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pix2tex/dataset/latex2png.py b/pix2tex/dataset/latex2png.py index 0fd7fb5..7af27f4 100644 --- a/pix2tex/dataset/latex2png.py +++ b/pix2tex/dataset/latex2png.py @@ -143,10 +143,10 @@ def tex2png(eq, **kwargs): return __cache[eq] -def tex2pil(tex, error_index=False, **kwargs): +def tex2pil(tex, return_error_index=False, **kwargs): pngs, error_index = Latex(tex, **kwargs).write(return_bytes=True) images = [Image.open(io.BytesIO(d)) for d in pngs] - return (images, error_index) if error_index else images + return (images, error_index) if return_error_index else images def extract(text, expression=None): From 4f7ba9fc5ea171ed470f2a0444c51cc14780a48c Mon Sep 17 00:00:00 2001 From: Lukas Blecher Date: Thu, 28 Apr 2022 16:30:00 +0200 Subject: [PATCH 8/8] Update render.py --- pix2tex/dataset/render.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pix2tex/dataset/render.py b/pix2tex/dataset/render.py index 9f0d921..e395bae 100644 --- a/pix2tex/dataset/render.py +++ b/pix2tex/dataset/render.py @@ -69,7 +69,7 @@ def render_dataset(dataset: np.ndarray, unrendered: np.ndarray, args): try: if args.preprocess: pngs, error_index = tex2pil( - math, dpi=dpi, font=font, error_index=True) + math, dpi=dpi, font=font, return_error_index=True) else: pngs, error_index = Latex(math, dpi=dpi, font=font).write( return_bytes=False)