From 590f7f181738ace224793ddddf9c9bcab2dfe59b Mon Sep 17 00:00:00 2001 From: Jokcer <519548295@qq.com> Date: Fri, 22 Nov 2024 21:19:15 +0800 Subject: [PATCH 1/5] fix: fix char blank --- lineless_table_rec/utils_table_recover.py | 2 +- wired_table_rec/utils_table_recover.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lineless_table_rec/utils_table_recover.py b/lineless_table_rec/utils_table_recover.py index fbb2cbb..6a66038 100644 --- a/lineless_table_rec/utils_table_recover.py +++ b/lineless_table_rec/utils_table_recover.py @@ -289,7 +289,7 @@ def gather_ocr_list_by_row(ocr_list: List[Any], thehold: float = 0.2) -> List[An cur[0], next[0], axis="y", threhold=thehold ) if c_idx: - dis = max(next_box[0] - cur_box[0], 0) + dis = max(next_box[0] - cur_box[2], 0) blank_str = int(dis / threshold) * " " cur[1] = cur[1] + blank_str + next[1] xmin = min(cur_box[0], next_box[0]) diff --git a/wired_table_rec/utils_table_recover.py b/wired_table_rec/utils_table_recover.py index 1b71d84..caf5d2c 100644 --- a/wired_table_rec/utils_table_recover.py +++ b/wired_table_rec/utils_table_recover.py @@ -262,7 +262,7 @@ def plot_rec_box_with_logic_info(img_path, output_path, logic_points, sorted_pol y1 = round(y1) cv2.rectangle(img, (x0, y0), (x1, y1), (0, 0, 255), 1) # 增大字体大小和线宽 - font_scale = 0.7 # 原先是0.5 + font_scale = 0.9 # 原先是0.5 thickness = 1 # 原先是1 logic_point = logic_points[idx] cv2.putText( @@ -309,13 +309,13 @@ def plot_rec_box(img_path, output_path, sorted_polygons): y1 = round(y1) cv2.rectangle(img, (x0, y0), (x1, y1), (0, 0, 255), 1) # 增大字体大小和线宽 - font_scale = 1.0 # 原先是0.5 - thickness = 2 # 原先是1 + font_scale = 0.9 # 原先是0.5 + thickness = 1 # 原先是1 cv2.putText( img, str(idx), - (x1, y1), + (x0 + 5, y0 + 5), cv2.FONT_HERSHEY_PLAIN, font_scale, (0, 0, 255), @@ -392,7 +392,7 @@ def gather_ocr_list_by_row(ocr_list: List[Any], threhold: float = 0.2) -> List[A cur[0], next[0], axis="y", threhold=threhold ) if c_idx: - dis = max(next_box[0] - 
cur_box[0], 0) + dis = max(next_box[0] - cur_box[2], 0) blank_str = int(dis / threshold) * " " cur[1] = cur[1] + blank_str + next[1] xmin = min(cur_box[0], next_box[0]) From ed661822490262379a69a8deec79c9af65c3c60a Mon Sep 17 00:00:00 2001 From: Jokcer <519548295@qq.com> Date: Fri, 22 Nov 2024 22:05:13 +0800 Subject: [PATCH 2/5] feat: adapt rapid ocr char rec --- README.md | 34 ++++++++++++------- README_en.md | 41 +++++++++++++---------- lineless_table_rec/utils_table_recover.py | 13 +++++++ wired_table_rec/table_line_rec_plus.py | 9 ++--- wired_table_rec/utils_table_recover.py | 13 +++++++ 5 files changed, 76 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 841e886..b9e0eeb 100644 --- a/README.md +++ b/README.md @@ -15,16 +15,16 @@ ### 最近更新 -- **2024.10.22** - - 补充复杂背景多表格检测提取方案[RapidTableDet](https://github.com/RapidAI/RapidTableDetection) - **2024.11.12** - - 抽离模型识别和处理过程核心阈值,方便大家进行微调适配自己的场景[微调入参参考](#核心参数) + - 抽离模型识别和处理过程核心阈值,方便大家进行微调适配自己的场景[输入参数](#核心参数) - **2024.11.16** - - 补充文档扭曲矫正方案,可作为前置处理 [文档扭曲变形修正](https://github.com/Joker1212/RapidUnWrap) + - 补充文档扭曲矫正方案,可作为前置处理 [RapidUnwrap](https://github.com/Joker1212/RapidUnWrap) +- **2024.11.22** + - 支持单字符匹配方案,需要RapidOCR>=1.4.0 ### 简介 💖该仓库是用来对文档中表格做结构化识别的推理库,包括来自阿里读光有线和无线表格识别模型,llaipython(微信)贡献的有线表格模型,网易Qanything内置表格分类模型等。\ -[快速开始](#安装) [模型评测](#指标结果) [使用建议](#使用建议) [文档扭曲变形修正](https://github.com/Joker1212/RapidUnWrap) [表格旋转及透视修正](#表格旋转及透视修正) [微调入参参考](#核心参数) [常见问题](#FAQ) [更新计划](#更新计划) +[快速开始](#安装) [模型评测](#指标结果) [使用建议](#使用建议) [单字匹配](#单字ocr匹配) [文档扭曲修正](https://github.com/Joker1212/RapidUnWrap) [表格旋转及透视修正](#表格旋转及透视修正) [输入参数](#核心参数) [常见问题](#FAQ) [更新计划](#更新计划) #### 特点 ⚡ **快** 采用ONNXRuntime作为推理引擎,cpu下单图推理1-7s @@ -106,7 +106,6 @@ print(f"elasp: {elasp}") # 使用其他ocr模型 #ocr_engine =RapidOCR(det_model_dir="xxx/det_server_infer.onnx",rec_model_dir="xxx/rec_server_infer.onnx") #ocr_res, _ = ocr_engine(img_path) -#html, elasp, polygons, logic_points, ocr_res = table_engine(img_path, ocr_result=ocr_res) # 
output_dir = f'outputs' # complete_html = format_html(html) @@ -121,6 +120,17 @@ print(f"elasp: {elasp}") # plot_rec_box(img_path, f"{output_dir}/ocr_box.jpg", ocr_res) ``` +#### 单字ocr匹配 +```python +# 将单字box转换为行识别同样的结构 +from rapidocr_onnxruntime import RapidOCR +from wired_table_rec.utils_table_recover import trans_char_ocr_res +img_path = "tests/test_files/wired/table4.jpg" +ocr_engine =RapidOCR() +ocr_res, _ = ocr_engine(img_path, return_word_box=True) +ocr_res = trans_char_ocr_res(ocr_res) +``` + #### 表格旋转及透视修正 ##### 1.简单背景,小角度场景 ```python @@ -165,19 +175,17 @@ for i, res in enumerate(result): ```python wired_table_rec = WiredTableRecognition() html, elasp, polygons, logic_points, ocr_res = wired_table_rec( - img_path, + img, # 图片 Union[str, np.ndarray, bytes, Path, PIL.Image.Image] + ocr_result, # 输入rapidOCR识别结果,不传默认使用内部rapidocr模型 version="v2", #默认使用v2线框模型,切换阿里读光模型可改为v1 - morph_close=True, # 是否进行形态学操作,辅助找到更多线框,默认为True - more_h_lines=True, # 是否基于线框检测结果进行更多水平线检查,辅助找到更小线框, 默认为True - h_lines_threshold = 100, # 必须开启more_h_lines, 连接横线检测像素阈值,小于该值会生成新横线,默认为100 - more_v_lines=True, # 是否基于线框检测结果进行更多垂直线检查,辅助找到更小线框, 默认为True - v_lines_threshold = 15, # 必须开启more_v_lines, 连接竖线检测像素阈值,小于该值会生成新竖线,默认为15 - extend_line=True, # 是否基于线框检测结果进行线段延长,辅助找到更多线框, 默认为True + enhance_box_line=True, # 识别框切割增强(关闭避免多余切割,开启减少漏切割),默认为True need_ocr=True, # 是否进行OCR识别, 默认为True rec_again=True,# 是否针对未识别到文字的表格框,进行单独截取再识别,默认为True ) lineless_table_rec = LinelessTableRecognition() html, elasp, polygons, logic_points, ocr_res = lineless_table_rec( + img, # 图片 Union[str, np.ndarray, bytes, Path, PIL.Image.Image] + ocr_result, # 输入rapidOCR识别结果,不传默认使用内部rapidocr模型 need_ocr=True, # 是否进行OCR识别, 默认为True rec_again=True,# 是否针对未识别到文字的表格框,进行单独截取再识别,默认为True ) diff --git a/README_en.md b/README_en.md index 6dc7ecf..5a22b3d 100644 --- a/README_en.md +++ b/README_en.md @@ -13,17 +13,16 @@ ### Recent Updates -- **2024.10.22** - - Added the complex background multi-table detection and extraction solution 
[RapidTableDet](https://github.com/RapidAI/RapidTableDetection). - - **2024.11.12** - Extracted model recognition and processing core thresholds for easier fine-tuning according to specific scenarios. See [Core Parameters](#core-parameters). - **2024.11.16** - - Added document distortion correction solution, which can be used as a pre-processing step [Document Distortion Correction](https://github.com/Joker1212/RapidUnWrap) + - Added document distortion correction solution, which can be used as a pre-processing step [RapidUnWrap](https://github.com/Joker1212/RapidUnWrap) +- **2024.11.22** + - Support Char Rec, RapidOCR>=1.4.0 [RapidUnWrap](https://github.com/Joker1212/RapidUnWrap) ### Introduction 💖 This repository serves as an inference library for structured recognition of tables within documents, including models for wired and wireless table recognition from Alibaba DulaLight, a wired table model from llaipython (WeChat), and a built-in table classification model from NetEase Qanything. -[Quick Start](#installation) [Model Evaluation](#evaluation-results) [Usage Recommendations](#usage-recommendations) [Document Distortion Correction](https://github.com/Joker1212/RapidUnWrap) [Table Rotation & Perspective Correction](#table-rotation-and-perspective-correction) [Fine-tuning Input Parameters Reference](#core-parameters) [Frequently Asked Questions](#faqs) [Update Plan](#update-plan) +[Quick Start](#installation) [Model Evaluation](#evaluation-results) [Char Rec](#Single-Character-OCR-Matching) [Usage Recommendations](#usage-recommendations) [Document Distortion Correction](https://github.com/Joker1212/RapidUnWrap) [Table Rotation & Perspective Correction](#table-rotation-and-perspective-correction) [Input Parameters](#core-parameters) [Frequently Asked Questions](#faqs) [Update Plan](#update-plan) #### Features ⚡ **Fast:** Uses ONNXRuntime as the inference engine, achieving 1-7 seconds per image on CPU. 
@@ -121,6 +120,16 @@ print(f"elasp: {elasp}") # Visualize OCR recognition boxes # plot_rec_box(img_path, f"{output_dir}/ocr_box.jpg", ocr_res) ``` +#### Single Character OCR Matching +```python +# Convert single character boxes to the same structure as line recognition +from rapidocr_onnxruntime import RapidOCR +from wired_table_rec.utils_table_recover import trans_char_ocr_res +img_path = "tests/test_files/wired/table4.jpg" +ocr_engine =RapidOCR() +ocr_res, _ = ocr_engine(img_path, return_word_box=True) +ocr_res = trans_char_ocr_res(ocr_res) +``` #### Table Rotation and Perspective Correction ##### 1. Simple Background, Small Angle Scene @@ -166,21 +175,19 @@ for i, res in enumerate(result): ```python wired_table_rec = WiredTableRecognition() html, elasp, polygons, logic_points, ocr_res = wired_table_rec( - img_path, - version="v2", # Default to use v2 line model, switch to Alibaba ReadLight model by changing to v1 - morph_close=True,# Whether to perform morphological operations to find more lines, default is True - more_h_lines=True, # Whether to check for more horizontal lines based on line detection results to find smaller lines, default is True - h_lines_threshold = 100, # Must enable more_h_lines, threshold for connecting horizontal line detection pixels, new horizontal lines will be generated if below this value, default is 100 - more_v_lines=True, # Whether to check for more vertical lines based on line detection results to find smaller lines, default is True - v_lines_threshold = 15, # Must enable more_v_lines, threshold for connecting vertical line detection pixels, new vertical lines will be generated if below this value, default is 15 - extend_line=True, # Whether to extend line segments based on line detection results to find more lines, default is True - need_ocr=True, # Whether to perform OCR recognition, default is True - rec_again=True,# Whether to re-recognize table boxes that were not recognized, default is True + img, # Image Union[str, 
np.ndarray, bytes, Path, PIL.Image.Image] + ocr_result, # Input rapidOCR recognition result, use internal rapidocr model by default if not provided + version="v2", # Default to using v2 line model, switch to AliDamo model by changing to v1 + enhance_box_line=True, # Enhanced box-line detection (turn off to avoid excessive cutting, turn on to reduce missed cuts), default is True + need_ocr=True, # Whether to perform OCR recognition, default is True + rec_again=True, # Whether to re-recognize table boxes without detected text by cropping them separately, default is True ) lineless_table_rec = LinelessTableRecognition() html, elasp, polygons, logic_points, ocr_res = lineless_table_rec( - need_ocr=True, # Whether to perform OCR recognition, default is True - rec_again=True, # Whether to re-recognize table boxes that were not recognized, default is True + img, # Image Union[str, np.ndarray, bytes, Path, PIL.Image.Image] + ocr_result, # Input rapidOCR recognition result, use internal rapidocr model by default if not provided + need_ocr=True, # Whether to perform OCR recognition, default is True + rec_again=True, # Whether to re-recognize table boxes without detected text by cropping them separately, default is True ) ``` diff --git a/lineless_table_rec/utils_table_recover.py b/lineless_table_rec/utils_table_recover.py index 6a66038..262226f 100644 --- a/lineless_table_rec/utils_table_recover.py +++ b/lineless_table_rec/utils_table_recover.py @@ -605,6 +605,19 @@ def format_html(html): """ +def trans_char_ocr_res(ocr_res): + word_result = [] + for res in ocr_res: + score = res[2] + for word_box, word in zip(res[3], res[4]): + word_res = [] + word_res.append(word_box) + word_res.append(word) + word_res.append(score) + word_result.append(word_res) + return word_result + + def get_rotate_crop_image(img: np.ndarray, points: np.ndarray) -> np.ndarray: img_crop_width = int( max( diff --git a/wired_table_rec/table_line_rec_plus.py b/wired_table_rec/table_line_rec_plus.py index 
2800102..7ea6c2a 100644 --- a/wired_table_rec/table_line_rec_plus.py +++ b/wired_table_rec/table_line_rec_plus.py @@ -73,17 +73,18 @@ def postprocess(self, img, pred, **kwargs): h_lines_threshold = kwargs.get("h_lines_threshold", 100) if kwargs else 100 v_lines_threshold = kwargs.get("v_lines_threshold", 15) if kwargs else 15 angle = kwargs.get("angle", 50) if kwargs else 50 + enhance_box_line = kwargs.get("enhance_box_line") if kwargs else True morph_close = ( - kwargs.get("morph_close", True) if kwargs else True + kwargs.get("morph_close", enhance_box_line) if kwargs else enhance_box_line ) # 是否进行闭合运算以找到更多小的框 more_h_lines = ( - kwargs.get("more_h_lines", True) if kwargs else True + kwargs.get("more_h_lines", enhance_box_line) if kwargs else enhance_box_line ) # 是否调整以找到更多的横线 more_v_lines = ( - kwargs.get("more_v_lines", True) if kwargs else True + kwargs.get("more_v_lines", enhance_box_line) if kwargs else enhance_box_line ) # 是否调整以找到更多的横线 extend_line = ( - kwargs.get("extend_line", True) if kwargs else True + kwargs.get("extend_line", enhance_box_line) if kwargs else enhance_box_line ) # 是否进行线段延长使得端点连接 ori_shape = img.shape diff --git a/wired_table_rec/utils_table_recover.py b/wired_table_rec/utils_table_recover.py index caf5d2c..235c39e 100644 --- a/wired_table_rec/utils_table_recover.py +++ b/wired_table_rec/utils_table_recover.py @@ -288,6 +288,19 @@ def plot_rec_box_with_logic_info(img_path, output_path, logic_points, sorted_pol cv2.imwrite(output_path, img) +def trans_char_ocr_res(ocr_res): + word_result = [] + for res in ocr_res: + score = res[2] + for word_box, word in zip(res[3], res[4]): + word_res = [] + word_res.append(word_box) + word_res.append(word) + word_res.append(score) + word_result.append(word_res) + return word_result + + def plot_rec_box(img_path, output_path, sorted_polygons): """ :param img_path From ed4722ba2de178296a4e8af2d965114de79d4b47 Mon Sep 17 00:00:00 2001 From: Jokcer <519548295@qq.com> Date: Fri, 22 Nov 2024 22:42:42 +0800 
Subject: [PATCH 3/5] test: add enhance_box_line param test --- README_en.md | 2 +- tests/test_wired_table_rec.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README_en.md b/README_en.md index 5a22b3d..e9ffe11 100644 --- a/README_en.md +++ b/README_en.md @@ -18,7 +18,7 @@ - **2024.11.16** - Added document distortion correction solution, which can be used as a pre-processing step [RapidUnWrap](https://github.com/Joker1212/RapidUnWrap) - **2024.11.22** - - Support Char Rec, RapidOCR>=1.4.0 [RapidUnWrap](https://github.com/Joker1212/RapidUnWrap) + - Support Char Rec, RapidOCR>=1.4.0 ### Introduction 💖 This repository serves as an inference library for structured recognition of tables within documents, including models for wired and wireless table recognition from Alibaba DulaLight, a wired table model from llaipython (WeChat), and a built-in table classification model from NetEase Qanything. diff --git a/tests/test_wired_table_rec.py b/tests/test_wired_table_rec.py index 2e0d782..de1a43b 100644 --- a/tests/test_wired_table_rec.py +++ b/tests/test_wired_table_rec.py @@ -68,17 +68,17 @@ def test_input_normal(img_path, gt_td_nums, gt2): @pytest.mark.parametrize( "img_path, gt_td_nums", [ - ("wired_big_box.png", 70), + ("wired_big_box.png", 44), ], ) -def test_input_normal(img_path, gt_td_nums): +def test_enhance_box_line(img_path, gt_td_nums): img_path = test_file_dir / img_path ocr_result, _ = ocr_engine(img_path) - table_str, *_ = table_recog(str(img_path), ocr_result) + table_str, *_ = table_recog(str(img_path), ocr_result, enhance_box_line=False) td_nums = get_td_nums(table_str) - assert td_nums >= gt_td_nums + assert td_nums <= gt_td_nums @pytest.mark.parametrize( From 42f92ad859c30c7379889f8b0d5780bccb198165 Mon Sep 17 00:00:00 2001 From: Jokcer <519548295@qq.com> Date: Fri, 22 Nov 2024 23:38:33 +0800 Subject: [PATCH 4/5] fix: fix rec_again param test case --- tests/test_wired_table_rec.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/tests/test_wired_table_rec.py b/tests/test_wired_table_rec.py index de1a43b..1d44622 100644 --- a/tests/test_wired_table_rec.py +++ b/tests/test_wired_table_rec.py @@ -285,7 +285,7 @@ def test_plot_html_table(logi_points, cell_box_map, expected_html): @pytest.mark.parametrize( "img_path, gt_td_nums, gt2", [ - ("table_recognition.jpg", 35, "d colsp"), + ("table_recognition.jpg", 20, "d colsp"), ], ) def test_no_rec_again(img_path, gt_td_nums, gt2): From 2519d98f5afcbcb3b80f8131eac16403818f75f4 Mon Sep 17 00:00:00 2001 From: Jokcer <519548295@qq.com> Date: Fri, 22 Nov 2024 23:46:24 +0800 Subject: [PATCH 5/5] chore: update readme --- README.md | 3 +-- README_en.md | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b9e0eeb..ac9df3c 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,6 @@ wired_table_rec_v2(有线表格精度最高): 通用场景有线表格(论文,杂志,期刊, 收据,单据,账单) paddlex-SLANet-plus(综合精度最高): 文档场景表格(论文,杂志,期刊中的表格) -[微调入参参考](#核心参数) ### 安装 @@ -106,7 +105,7 @@ print(f"elasp: {elasp}") # 使用其他ocr模型 #ocr_engine =RapidOCR(det_model_dir="xxx/det_server_infer.onnx",rec_model_dir="xxx/rec_server_infer.onnx") #ocr_res, _ = ocr_engine(img_path) - +#html, elasp, polygons, logic_points, ocr_res = table_engine(img_path, ocr_result=ocr_res) # output_dir = f'outputs' # complete_html = format_html(html) # os.makedirs(os.path.dirname(f"{output_dir}/table.html"), exist_ok=True) diff --git a/README_en.md b/README_en.md index e9ffe11..c2fc24e 100644 --- a/README_en.md +++ b/README_en.md @@ -22,7 +22,7 @@ ### Introduction 💖 This repository serves as an inference library for structured recognition of tables within documents, including models for wired and wireless table recognition from Alibaba DulaLight, a wired table model from llaipython (WeChat), and a built-in table classification model from NetEase Qanything. 
-[Quick Start](#installation) [Model Evaluation](#evaluation-results) [Char Rec](#Single-Character-OCR-Matching) [Usage Recommendations](#usage-recommendations) [Document Distortion Correction](https://github.com/Joker1212/RapidUnWrap) [Table Rotation & Perspective Correction](#table-rotation-and-perspective-correction) [Input Parameters](#core-parameters) [Frequently Asked Questions](#faqs) [Update Plan](#update-plan) +[Quick Start](#installation) [Model Evaluation](#evaluation-results) [Char Rec](#Single-Character-OCR-Matching) [Usage Recommendations](#usage-recommendations) [Document Distortion Correction](https://github.com/Joker1212/RapidUnWrap) [Table Rotation & Perspective Correction](#table-rotation-and-perspective-correction) [Input Parameters](#core-parameters) [Frequently Asked Questions](#FAQ) [Update Plan](#update-plan) #### Features ⚡ **Fast:** Uses ONNXRuntime as the inference engine, achieving 1-7 seconds per image on CPU. @@ -70,7 +70,7 @@ Surya-Tabled uses its built-in OCR module, which is a row-column recognition mod ### Usage Recommendations wired_table_rec_v2 (highest precision for wired tables): General scenes for wired tables (papers, magazines, journals, receipts, invoices, bills) -paddlex-SLANet-plus (highest overall precision): Document scene tables (tables in papers, magazines, and journals) [Fine-tuning Input Parameters Reference](#core-parameters) +paddlex-SLANet-plus (highest overall precision): Document scene tables (tables in papers, magazines, and journals) ### Installation