main
HuangHai 1 month ago
commit a72954a90e

.gitignore

@ -0,0 +1,36 @@
# Byte-compiled / optimized / DLL files
__pycache__/
.ipynb_checkpoints/
*.py[cod]
*$py.class
# C extensions
*.so
inference/
inference_results/
output/
train_data/
log/
*.DS_Store
*.vs
*.user
*~
*.vscode
*.idea
*.log
.clang-format
.clang_format.hook
build/
dist/
paddleocr.egg-info/
/deploy/android_demo/app/OpenCV/
/deploy/android_demo/app/PaddleLite/
/deploy/android_demo/app/.cxx/
/deploy/android_demo/app/cache/
test_tipc/web/models/
test_tipc/web/node_modules/
Images/
ImagesSmall/
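
To verify that a given path is covered by one of these patterns, `git check-ignore -v` reports the matching rule and the line it came from; the example paths below are illustrative:

```bash
# Prints the .gitignore rule that matches each path, if any
git check-ignore -v output/page_0.png log/run.log
```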

@ -0,0 +1,45 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: check-added-large-files
        args: ['--maxkb=512']
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: check-symlinks
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: trailing-whitespace
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$
  - repo: https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.5.5
    hooks:
      - id: remove-crlf
      - id: remove-tabs
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$
  - repo: local
    hooks:
      - id: clang-format
        name: clang-format
        description: Format files with ClangFormat
        entry: bash .clang_format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
  # For Python files
  - repo: https://github.com/psf/black.git
    rev: 24.10.0
    hooks:
      - id: black
        files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
  # Flake8
  - repo: https://github.com/pycqa/flake8
    rev: 7.1.1
    hooks:
      - id: flake8
        args:
          - --count
          - --select=E9,F63,F7,F82,E721
          - --show-source
          - --statistics
        exclude: ^benchmark/|^test_tipc/
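
Activating these hooks locally follows the standard pre-commit workflow (nothing repo-specific beyond the config above):

```bash
pip install pre-commit
pre-commit install            # register the git hook in .git/hooks
pre-commit run --all-files    # run every hook once against the whole tree
```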

@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80
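
yapf discovers `.style.yapf` automatically when run from the repository root, so reformatting a file needs no extra style flags; any Python file in the repo works, e.g.:

```bash
pip install yapf
yapf --in-place Config.py
```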

@ -0,0 +1,18 @@
# Base path configuration
basePath = r'D:\dsWork\PaddleOCR'
# Source PDF configuration
source_pdf = basePath + r'\大数据研究苏轼\[047.中国古典文学基本丛书.苏轼词编年校注].王宗堂,邹同庆撰.扫描版.pdf'
# Image processing configuration
images_dir = f"{basePath}\\Images"
compressed_images_dir = f"{basePath}\\ImagesSmall"
target_width = 587  # target width in pixels (default 587)
image_quality = 85  # JPEG compression quality
# Timestamp format
time_format = '%Y-%m-%d %H:%M:%S'
# OCR output configuration (text and Markdown currently share one directory)
ocr_output_dir = f"{basePath}\\KeCheng\\Txt"
markdown_output_dir = f"{basePath}\\KeCheng\\Txt"
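
The double-backslash paths above are Windows-only; if portability matters, the same configuration can be expressed with `os.path.join` (a sketch using the same directory names):

```python
import os

basePath = r'D:\dsWork\PaddleOCR'
images_dir = os.path.join(basePath, "Images")
compressed_images_dir = os.path.join(basePath, "ImagesSmall")
ocr_output_dir = os.path.join(basePath, "KeCheng", "Txt")
markdown_output_dir = ocr_output_dir
```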

@ -0,0 +1,15 @@
import datetime
import os

import fitz  # PyMuPDF

# Import configuration
from Config import source_pdf, images_dir, time_format

# Make sure the image output directory exists
os.makedirs(images_dir, exist_ok=True)

# Open the PDF and render every page to a 300-dpi PNG
pdf = fitz.open(source_pdf)
for page_num in range(len(pdf)):
    page = pdf.load_page(page_num)
    pix = page.get_pixmap(dpi=300)
    pix.save(f"{images_dir}\\page_{page_num}.png")
    # Print a timestamped progress message
    print(f"{datetime.datetime.now().strftime(time_format)} processing page {page_num}")
pdf.close()

@ -0,0 +1,66 @@
import os

from PIL import Image

# Import configuration
from Config import target_width, image_quality, compressed_images_dir


def process_image(img_path, target_width=target_width):
    """
    Shrink an image to the target width and save it as a compressed JPEG.

    :param img_path: path of the input image
    :param target_width: target width in pixels (default 587, a size known to OCR well)
    """
    # Preprocessing: scale the image down to the target width
    img = Image.open(img_path)
    width, height = img.size
    print(f"Original image size: {width}x{height}")
    # Compute the scale factor, preserving the aspect ratio
    scale = target_width / width
    new_width = target_width
    new_height = int(height * scale)
    print(f"Target image size: {new_width}x{new_height}")
    # Resize the image
    resized_img = img.resize((new_width, new_height), Image.LANCZOS)
    # Make sure the output directory for compressed images exists
    os.makedirs(compressed_images_dir, exist_ok=True)
    # Derive the output path from the original file name
    file_name = os.path.basename(img_path)
    output_name = os.path.splitext(file_name)[0] + "_compressed.jpg"
    output_path = os.path.join(compressed_images_dir, output_name)
    # Save the shrunken image
    resized_img.convert('RGB').save(output_path, 'JPEG', quality=image_quality, optimize=True)
    print(f"Image resized: {width}x{height} -> {new_width}x{new_height}")
    # Release memory
    del resized_img
    img.close()
    del img


if __name__ == "__main__":
    import glob

    from Config import images_dir

    # Process every PNG page image in the Images directory
    image_files = glob.glob(os.path.join(images_dir, "page_*.png"))
    print(f"Found {len(image_files)} image files to process")
    for i, img_path in enumerate(image_files):
        print(f"[{i+1}/{len(image_files)}] processing: {os.path.basename(img_path)}")
        process_image(img_path)
    print(f"All images processed; compressed copies are in: {compressed_images_dir}")

@ -0,0 +1,45 @@
import datetime
import glob
import os

from paddleocr import PPStructureV3

# Import configuration
from Config import compressed_images_dir, ocr_output_dir, markdown_output_dir, time_format

# Make sure the output directories exist
os.makedirs(ocr_output_dir, exist_ok=True)
os.makedirs(markdown_output_dir, exist_ok=True)

# Initialize the OCR pipeline
pipeline = PPStructureV3()

# Collect the compressed page images, sorted numerically by page number
image_files = glob.glob(os.path.join(compressed_images_dir, "page_*_compressed.jpg"))
image_files.sort(key=lambda x: int(os.path.basename(x).split('_')[1]))
print(f"Found {len(image_files)} images to run OCR on")

# Process each image
for i, img_path in enumerate(image_files):
    # Extract the page number from the file name
    page_num = os.path.basename(img_path).split('_')[1]
    # Skip pages whose output directory already exists
    markdown_save_path = os.path.join(markdown_output_dir, f"page_{page_num}")
    if os.path.exists(markdown_save_path):
        print(f"{datetime.datetime.now().strftime(time_format)} OCR result for page {page_num} already exists, skipping")
        continue
    # Log progress
    print(f"{datetime.datetime.now().strftime(time_format)} running OCR on page {page_num}")
    # Run OCR
    output = pipeline.predict(img_path)
    # Save the results via save_to_markdown
    for res in output:
        res.save_to_markdown(save_path=markdown_save_path)
    print(f"Page {page_num} OCR done; result saved to: {markdown_save_path}")

print(f"All images processed; results are in: {markdown_output_dir}")

@ -0,0 +1,66 @@
[{'input_path': 'D:\\dsWork\\PaddleOCR\\ImagesSmall\\page_0_compressed.jpg', 'page_index': None, 'doc_preprocessor_res': {'output_img': array([[[104, ..., 217],
...,
[ 48, ..., 56]],
...,
[[ 21, ..., 32],
...,
[ 14, ..., 19]]], shape=(862, 587, 3), dtype=uint8)}, 'layout_det_res': {'input_path': None, 'page_index': None, 'input_img': array([[[104, ..., 217],
...,
[ 48, ..., 56]],
...,
[[ 21, ..., 32],
...,
[ 14, ..., 19]]], shape=(862, 587, 3), dtype=uint8), 'boxes': [{'cls_id': 1, 'label': 'image', 'score': 0.5590275526046753, 'coordinate': [np.float32(146.0109), np.float32(63.905975), np.float32(259.0317), np.float32(180.34189)]}, {'cls_id': 2, 'label': 'text', 'score': 0.4184643626213074, 'coordinate': [np.float32(155.71976), np.float32(3.9154816), np.float32(514.49243), np.float32(34.9183)]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.39416977763175964, 'coordinate': [np.float32(84.17284), np.float32(231.3527), np.float32(140.27222), np.float32(686.1651)]}]}, 'region_det_res': {'input_path': None, 'page_index': None, 'input_img': array([[[104, ..., 217],
...,
[ 48, ..., 56]],
...,
[[ 21, ..., 32],
...,
[ 14, ..., 19]]], shape=(862, 587, 3), dtype=uint8), 'boxes': [{'cls_id': 0, 'label': 'Region', 'score': 0.9720134139060974, 'coordinate': array([ 84.17284, ..., 686.1651 ], shape=(4,), dtype=float32)}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'doc_preprocessor_res': {'output_img': array([[[104, ..., 217],
...,
[ 48, ..., 56]],
...,
[[ 21, ..., 32],
...,
[ 14, ..., 19]]], shape=(862, 587, 3), dtype=uint8)}, 'dt_polys': [array([[155, 7],
...,
[156, 34]], shape=(4, 2), dtype=int16), array([[ 85, 234],
...,
[ 85, 685]], shape=(4, 2), dtype=int16)], 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': True}, 'text_det_params': {'limit_side_len': 736, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'text_rec_score_thresh': 0.0, 'rec_texts': ['中國古典文學基本叢書', '蘇轼詞编年校注'], 'rec_scores': [0.8946498036384583, 0.7676457762718201], 'rec_polys': [array([[155, 7],
...,
[156, 34]], shape=(4, 2), dtype=int16), array([[ 85, 234],
...,
[ 85, 685]], shape=(4, 2), dtype=int16)], 'textline_orientation_angles': [0, 0], 'rec_boxes': array([[155, ..., 34],
[ 85, ..., 685]], shape=(2, 4), dtype=int16), 'rec_labels': ['text', 'text']}, 'table_res_list': [], 'seal_res_list': [], 'chart_res_list': [], 'formula_res_list': [], 'parsing_res_list': [
#################
index: 1
label: text
region_label: normal_text
bbox: [155, 3, 514, 34]
content: 中國古典文學基本叢書
#################,
#################
index: 2
label: doc_title
region_label: doc_title
bbox: [84, 231, 140, 686]
content: 蘇轼詞编年校注
#################,
#################
index: 0
label: image
region_label: vision
bbox: [146, 63, 259, 180]
content:
#################], 'imgs_in_doc': [{'path': 'imgs/img_in_image_box_146_63_259_180.jpg', 'img': <PIL.Image.Image image mode=RGB size=113x117 at 0x25466A74910>, 'coordinate': (146, 63, 259, 180), 'score': 0.5590275526046753}], 'model_settings': {'use_doc_preprocessor': False, 'use_seal_recognition': True, 'use_table_recognition': True, 'use_formula_recognition': True, 'use_chart_recognition': False, 'use_region_detection': True}}]
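
The dump above also documents where the plain recognized text lives: under the `overall_ocr_res` key, in the parallel `rec_texts` and `rec_scores` lists. A sketch of pulling just those strings out of a prediction; the dict-style access mirrors the structure shown in the dump and should be treated as an assumption to verify:

```python
from paddleocr import PPStructureV3

pipeline = PPStructureV3()
output = pipeline.predict(r"D:\dsWork\PaddleOCR\ImagesSmall\page_0_compressed.jpg")
for res in output:
    ocr_res = res["overall_ocr_res"]  # assumed dict-style access, per the dump
    # e.g. 0.895  中國古典文學基本叢書
    for text, score in zip(ocr_res["rec_texts"], ocr_res["rec_scores"]):
        print(f"{score:.3f}  {text}")
```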

(Binary image file added: 9.9 KiB; not shown.)

@ -0,0 +1,5 @@
中國古典文學基本叢書
# 蘇轼詞编年校注
<div style="text-align: center;"><img src="imgs/img_in_image_box_146_63_259_180.jpg" alt="Image" width="19%" /></div>

(Binary image file added: 10 KiB; not shown.)

@ -0,0 +1,6 @@
中国古典文學基本叢書
<div style="text-align: center;"><img src="imgs/img_in_image_box_150_66_263_182.jpg" alt="Image" width="19%" /></div>
蘇轼詞编年校注

@ -0,0 +1,19 @@
蘇轼铜编年校註
寫景、記,以及愛國思想農村生活說理談禪等等,都是晚唐五代以來詞人反映較少或完全没有涉及工時,成功的具有豪放風格作品的出現,也有利於詞的内容的繼續擴大。從蘇軾開始,通過他的實践,終
豪情,他在(沁圆春》(孤館燈青)中说:
正是由於有這樣的自信和自豪,才寫出了[妥帖排的豪放之作。也正因為如此,次年在密州出獵時,店英勇豪行為下阕進一步寫道:
酒酣胸膽尚開张赞微霜,又何妨。持節雲中,何日遣馮唐?會挽雕弓如满月,西北望,射天
狼!
把狩獵時的豪情上升為保衛邊疆殺敵報國的激昂壯志,成為豪放詞的代表作大概與此同時,他又寫手

@ -0,0 +1,11 @@
沙〉(怪見眉间一點黄),其下闕云:
贊揚梁左藏奉詔從軍的光榮。同時還有一首(南鄉子》(旌旆满江湖),寫道:
等愛國詞人導夫先路,至辛棄疾而臻其極,厥功甚偉。
,,宗讀至[瓊樓玉宇」二句,嘆日:「蘇軾終是愛君。以之相參,劉說是深刻的。
蘇軾在登臨懷古詞中也流露出同類的心情。這一题材始于王安石的(桂枝香》〔金陵懷古]蘇軾於元豐五年在黄州貶所寫的(念奴嬌》赤壁懷古],歷來舉為豪放詞的代表作。黄夢國云:「题是赤壁,手手手手(即[故國三句從周郎拍到自己,撫古傷今,抒發自己遭誣被貶,白首無成的沉重苦悶。其下雖有消極情緒,但從他對周瑜的讚美欽慕中仍然可以看到他要求為國建功立業的爱國熱情。辛棄疾曾四次和韻①

@ -0,0 +1,13 @@
蘇轼詞编年校註
類抒寫愛國主義思想的作品是應該大書特書的。
蘇軾對题材意境的另一重大開拓是把農村生活引人詞中。他在徐州太守任上寫了(浣溪沙》組詞,中中自中稱是此中人的使君,並寫出了使君和農民親密融洽的關係。這一組農村生活詞為詞的表現内容開关了(西江月〉(明月别枝驚鹊)等名篇,都是這组詞的缴承。
(中中

@ -0,0 +1,11 @@
萬頃風濤不記蘇,雪晴江上麥千車。但令人飽我愁無。
(景蘇圖帖》第五石刻此詞,首句下注:[公田在蘇州,今年風潮蕩盡。據此,詞的意思是說:儘管我蘇現了作者忘記自身困苦,只對人民的生計關切。這種精神境界使我們想起杜甫的(茅屋為秋風所破歌〉和白居易的(新製布裘》詩,這不但是他以前詞人中所未見,他以後的詞人中也是没有的
之作。上阕寫十年來對其亡妻一直難忘的思念和自己仕途的凄凉下関寫幽夢還鄉,夢中相見,宛如平生。醒來想到如亡妻有靈,一定在年年月明之夜,为思念千里之外尘面赞霜的丈夫而柔腸寸断。通((友的(感皇恩(案上敷编書)°
中風清,一枕初寒夢不成。今夜残燈斜照處,熒熒。秋雨晴時淚不晴表現别後思念之深又如寫别李

@ -0,0 +1,7 @@
蘇轼詞编年校注
,器我情至意切,給人以强烈的悵邑之感。尤其是以下四首,更有高度的藝術特色與藝術見解:(水龍境,空靈中雜以凄。過片始言情,有滄波浩渺之致,真高格也。[雲夢二句,妙能寫聞中情景。煞拍,车(歸朝歡〉(我夢扁舟浮震澤),這是紹聖元年(一〇九四)蘇軾貶惠州途中,行至九江,與蘇堅分别時所創作新詞。向民歌學習的進步思想和見解,在當時有劃時代的意義。
和他的詩一樣,蘇軾有許多抒發個人感情和歌詠自然景物的優美詞作。在元豐二年(一九七)烏(関「沙河塘裹燈初上,水調誰家唱?夜闌風静欲歸時,惟有一江明月碧琉璃。寫出了杭州的風物之美。(望江南》(春未老)寫春天登密州超然臺所見到的[風细柳絲斜「半壕春水一城花,煙雨暗千家的動

@ -0,0 +1 @@
人景色。而在赴湖州途中的(南鄉子〉「雨暗初疑夜,風回便報晴,淡雲斜照著山明。细草軟沙溪路、,但是,儒家人世的思想始終居於主導地位,因而没有走向消極颓廢的道路。即如他的(卜算子(缺月掛江仙》(夜飲東坡醒復醉),有衝破黑暗、走向自由的熱望。(水調歌頭》(落日繡簾捲),面對快哉亭下的的思想。他在(浣溪沙〉(山下蘭芽短浸溪)中,看到清泉寺門前溪水西流,就想到人生也能再少,青春可中蘇軾於[技藜徐步中消磨壯志,與辛棄疾同調詞(枕簟溪堂冷欲秋)的在「一丘一壑中消磨壯志,用意無乃相似。其中[紅蓮白鳥二句,当非偶合。即在他六十四歲謫居儋耳所寫的(减字木蘭花》(春牛

@ -0,0 +1,17 @@
蘇轼詞编年校注
人衰憊之氣。
杜甫以議論為詩,詞壇上王安石始見端倪,如(浪淘沙令〉(伊吕兩衰翁)雨霖鈴〉(孜孜矻矻)望大如(水調歌頭》(明月幾時有),王閱運說:[人有三句,大開大闔之筆,他人所不能。永遇樂〉(明月如,,無迹矣。二是通篇議論。如〈满庭芳》(蝸角虚名)減字木蘭花〉(賢哉令尹)(如夢令〉(水垢何曾相受)(自净方能净彼)等皆是,遂開後人以詞論文論政論禪、論道之先河。
總之,詞至蘇軾,其體始尊。其思想性和藝術性不僅超越前人,亦有後人所未及者。雄篇奇製,照
耀寰宇,若李杜之於詩歌,韓柳之於文章,蔚為大宗,影響深遠。元好問云:「自東坡一出,情性之外,不
知有文字,真有一洗萬古凡馬空氣象。誠非過言。
鄒同慶王宗堂二同志致力蘇詞研究,從事編年箋註,引證時事,比檢史籍,力求言之有據。注释中请中者之良友编纂既成,屬為弁言。聊述管窺,以供参考一九九一年高文

@ -0,0 +1,11 @@
凡例
(,,()一()(),(),,「箋註」參考資料」等欄目誤人蘇集詞只列[考辨不校不註。
二、本書正编文字,以天津國書館藏天一阁鈔明吴訥编唐宋名賢百家詞》本(東坡詞》三卷為底本(下吴本)以北京圖書館藏清鈔本宋傅幹(注坡詞》十二卷(下称傅本)元延祐庚申刊(东坡樂府)二卷(下稱元本)明茅維编(蘇東坡全集》本(東坡詞》二卷(下稱明刊全集)焦编(蘇長公二妙集)本(彊村叢書》本(東坡樂府》三卷(下稱朱本)龍榆生(東坡樂府箋)三卷(下稱龍本)唐圭璋编(全宋)記詞話、詞譜等以定是非凡此,均在校記中説明。
三校勘力求簡明凡底本不誤而校本誤者不出校凡底本不誤而校本異文有参考價值者,出異文校

@ -0,0 +1,15 @@
蘇轼詞编年校註
一五七首,拾遗四十首,計得三一一首。凡他本有吴本無者,均予採録。其中確為蘇詞者,予以增中
。比,以便比較。资料搜集範图,自宋至清。近代当代有代表性的考評,酌情選録。
八、附録墓誌銘、傳記(蘇軾詞集版本綜述》序跋等,供研究蘇詞者參考。九、為便於讀者翻檢查尋,另编蘇詞篇目索引附在書末。
校註者
一九八八年五月

@ -0,0 +1,25 @@
## 目録
蘇軾詞编年校註正編
一、蘇軾编年詞二九二首
宋英宗治平元年甲辰(一〇六四年)
熙寧四年辛亥(一〇七一年)
南歌子(縮雙蟠)……九
日鲜
## 又琥珀腰佩…11
熙寧五年壬子(一〇七二年)
浪淘沙(昨日出東城)…一四浣溪沙(徐邈能中酒聖賢)……一五雙荷葉(雙月)…一八
荷花媚霞苞薯………1
熙寧六年癸丑(一〇七三年)
()()瑞麟鸠(城頭月落尚啼鳥)…二八江城子(鳳凰山下雨初晴)……三菩薩蠻(簾高傾出)……三五

(Binary image file added: 9.8 KiB; not shown.)

@ -0,0 +1,5 @@
中國古典文學基本叢書
# 蘇軾詞编年校注
<div style="text-align: center;"><img src="imgs/img_in_image_box_147_67_261_184.jpg" alt="Image" width="19%" /></div>

@ -0,0 +1,9 @@
## 蘇轼詞编年校注
瑞鹉鸪(碧山影裹小紅旗)川……臨江仙(四大從來都遍滿)………四〇江城子(玉人家在鳳凰山)…四二熙寧七年甲寅(一〇七四年)
行香子(手江村)減字木蘭花(來風細)……四八昭君怨(誰作桓伊三弄)……四九卜算子(蜀客到江南)……五二蝶戀花(雨春容清更麗)………五四占春芳(红杏了)…五六醉落魄(輕雲微月)…五八少年遊(去年相送)4…減字木蘭花雙龍對……六二鵲橋仙山仙子………六五虞美人湖山信是東南美………六七訴衷情錢塘風景古來奇……九
菩薩蠻(玉童西迓浮丘伯)……七一减字木蘭花(雲傾倒)……七四菩薩蠻(娟娟缺月西南落)……七六江城子(翠蛾羞黛怯人看)…·…七八菩薩蠻(風湖上蕭蕭雨)……八一清平樂(清淮濁汴)…二南鄉子(回首山)……八五勸金船(無情流水多情客)………八六南鄉子(東武望餘杭)……九〇浣溪沙(縹危樓紫翠間)……九二又白雪清詞出坐間)………九三南鄉子(帶石榴紅)………九五又(旌旆江湖)………九九定風波(千古風流阮步兵……一1减字木蘭花惟熊佳夢……一四南鄉子不到謝公·…一

@ -0,0 +1,9 @@
菩薩蠻天憐豪俊腰金晚………一九阮郎歸一年三度過蘇臺……111醉落魄(蒼颜華发…一四菩薩蠻玉笙不受朱暖……一一六采桑子多情多·仍多病……一人减字木蘭花銀筝旋品……1二1一醉落魄分攜如昨…………訴衷情小蓮初上琵琶弦……一二五更漏子水涵空…二八浣溪沙長記鳴琴子賤堂……一二九永遇樂長憶别時1|沁園春(孤館燈青)四南鄉子(寒雀滿疏籬)……一三八熙寧八年乙卯(一七五年)
蝶戀花(燈火錢塘三五夜)………一四〇江城子(十年生死雨茫茫)……一四一
日眼
雨中花慢(今歲花時深院)………一四三江城子(老夫聊發少年狂)·…一四六减字木蘭花(贤哉令尹)……一四九又(春亭下)…一五一熙寧九年丙辰(一〇七六年)
一叢花(今年春淺臘侵年)…一五四蝶戀花(簾外東風交雨)……一五七滿江紅(天豈無情)………一五九人娇(别.來時)……六一望江南(眷未老)……一六四又(春已老…一六六滿江紅(東武城南)……一六八臨江仙(九十日春都過了)……一七一水調歌頭(明月幾時有)……一七三河子(見說岷峨·馆)…一八一畫堂春(柳花飛處麥搖波)……一八五

@ -0,0 +1,11 @@
## 蘇轼詞编年校注
江城子(前瞻馬耳九仙山)……一八七又(相逢不覺又初寒)……一八九熙寧十年丁巳(一〇七七年)
陽關曲(濟南春好雪初晴……一九一浣溪沙四面垂楊十里荷……一九三又傅粉郎君又粉奴……一九五人娇滿院桃花………一九七洞仙歌江南臘盘…1滿庭芳香雕盤…二三浣溪沙紅照淺溪……二七陽關曲(暮雲收溢清寒……二九水調歌頭安石在東海……111浣溪沙一别姑蘇已四年……二1五菩薩蠻城隅静女何人見……三八臨江仙忘却成都來十载.…二二元豐元年戊午(一〇七八年)
臨江仙自古相從休務日……11Ⅲ蝶戀花(··無風花自a…二二六浣溪沙衡愧今年二麥豐…二二八又照日深紅暖見魚………二三其二(旋抹红妝看使君…二三二其三麻葉層層葉光……1三其四(嵌衣巾落花)·二三五其五(軟草平莎過雨新)……二三七蝶戀花(别酒君君一醉)……二三八南鄉子(凉簟碧紗厨).…二四三千秋歲(淺霜·)…二四五永遇樂(明月如霜)……二四七陽關曲(受降城下紫髯郎)……二五三浣溪沙(惟見眉·一點黄).…二五五元豐二年己未(一〇七九年)
南鄉子(.玉……二五七

@ -0,0 +1,9 @@
又(未倦長卿遊)……二六〇江城子(天涯流落思無窮)………二六二减字木蘭花(玉無味)……二六五江城子(墨雲拖雨西樓)……二六六南歌子(山雨瀟過)………二六八漁家傲(皎皎牽牛河漢女)……二七〇元豐三年庚申(一〇八〇年)
臨江仙(細馬遠馱雙侍女)…二七二卜算子(·月掛疏桐)……二七五南歌子(寸恨云短)……二八六南鄉子(晚景落瓊杯)……二八八菩薩蠻(檐初掛彎月)……二九一其二(風迴仙馭開扇)……二九三定風波(與客攜壺上翠微)…二九五水龍吟(楚山修竹如)……二九八菩薩蠻(翠鬟斜幔雲垂耳)……三〇四
其二(柳庭風静人眠畫)……三〇七其三(井雙照新妝冷)……三〇八其四(雪花飛暖融香頰)……三〇九元豐四年辛酉(一〇八一年)
少年遊(玉肌鉛粉傲秋霜)……三水龍吟(似花似非花)……三一四()少年(銀塘朱檻麴塵波)……三二九南鄉子(霜降水痕收)……三三一滿江红(江漢西來)…浣溪沙(块青青麥·未蘇)……三三九其二(醉夢蘸曉未蘇)……三四一其三(雪裹餐例蘇)……三四三其四(半夜銀山上積蘇)………三四四其五(萬頃風濤不記蘇)…·三四六江城子(黄昏猶是雨纖織)……三四七

@ -0,0 +1,11 @@
## 蘇轼詞编年校注
元豐五年壬戌(一〇八二年)
水龍吟(小舟截春江)…三四九江城子(夢中了了醉中醒)……三五二定風波(莫聽穿林打葉聲)三五六浣溪沙(山下蘭芽短侵溪).……三五八西江月(照野淺浪)………三六〇南歌子(日出山雨……三六四又(雨暗初疑夜)……三六七又(帶酒·山雨)…………三六八浣溪沙(西塞山白鷺飛).…三七〇漁父(父.)…三七六又(漁父醉)……三七七又(漁父醒)……三七八又(漁笑)…三七九調笑令父)…三八〇又(雁)……三八一
中册
念奴娇(大江東去)洞仙歌(冰肌玉骨)念奴娇(憑高眺遠)
三三三六三八三九三九六三九八$\fr0$ 四三四三四二四二四四四二

@ -0,0 +1,11 @@
又(雙墜)…四三九又(天真雅麗)…四四又柔和.)…四四二又(天然宅院)……四四三西江月(龍焙今年絶品)……四四五菩薩蠻(碧紗微露纖纖玉)……四四九醉翁操(琅然)四五一滿庭芳.(角虚名)……四五八定風波(好睡慵開莫厭).…四六二元豐六年癸亥(一〇八三年)
木蘭花令(烏啼鹊噪昏喬木)……四六四臨江仙(夜飲東坡醒復醉)……四六七好事近(红粉莫悲啼)…四六九滿庭芳(三十三年今誰存者)……四七一鸪天(林断山明竹牆).…四七四十拍子(白酒新開九)………四七六
目録
浣溪沙(傾蓋相逢勝白頭)………四七八又(炙手無人傍屋頭)………四八一水調歌頭(落日簾捲)……四八三南歌子(…霍元後)…四八七臨江仙(詩句端來磨我鈍)……四九减字木蘭花(江南遊女)…四九五阜羅特(采菱拾)……四九六元豐七年甲子(一〇八四年)
减字木蘭花(神聞意定)……四九九無愁可解.(景百年)……〇一滿庭芳(歸去來兮吾歸何處)……五〇六阮郎歸(綠槐高咽新蟬)……五一〇西江月(别已随流水)……五一二漁家傲(千古龍蟠並虎踞)……五一五水龍吟(露寒煙冷葭老)…五一八减字木蘭花(鄭莊好客)…二一

@ -0,0 +1,15 @@
蘇轼詞编年校注
南歌子(欲執河梁手)…五二六菩薩(買田陽羨吾將老)…五二七南歌子(見說東圓好)……三〇西江月(三過平山堂下)……五三三浣溪沙(學鴉兒正妙年)…五三七又(一夢江湖費五年)……五三九虞美人(波聲拍枕長淮曉)…五四一如夢令(城上層樓叠)……五四四又(水垢何曾相受)…五四六其二(自净方能.彼)…五四九浣溪沙(細雨斜風作曉寒)……五五〇行香子(北望平川)水龍吟(古來海茫茫)………五五六满庭芳(三十三年漂流江海)……五六三元豐八年乙丑(一〇八五年)
南鄉子(千騎試春遊)………五六六
滿庭芳(歸去來兮清溪無底)……五六八二…(器回头)又(昨夜秋風來萬里)…五七四又(自古漣漪佳绝地)……五七六宋哲宗元祐元年丙寅(一〇八六年)定風波(誰羨人間琢玉郎)………五七八如夢令(為向東坡傳語)……五八三其二(手種堂前桃李)·…五八六元祐二年丁卯(一〇八七年)
蘇遮(暑晴)…五八八元祐三年戊辰(一〇八八年)
哨徧(睡畫堂)…五九〇西江月(莫欺平·落落)…五九七元祐四年己巳(一〇八九年)
行香子(綺席終)……五九九漁家傲(送客歸來燈火盡)……六〇二

@ -0,0 +1,9 @@
浣溪沙(珠檜絲杉冷欲霜)………六〇五其二(霜真堪拒霜)…六〇七點绛(我輩情鍾)……六〇九元祐五年庚午(一〇九〇年)
臨江仙(多病休文都瘦損)……六一一南歌子(山與歌眉斂)……一三又古岸開青)…六一六鹊橋仙(…槎去)…六一八南歌子(海上乘槎侣)…二其二(再再中秋………六二四點.(不用悲秋)……六二五又(莫唱陽關……六二八又(聞倚牀……六三〇好事近(湖上雨晴時)………六三三浣溪沙(門外東風雪灑裾)……六三五南歌子(師唱家曲)…………六三七
四绿
元祐六年辛未(一〇九一年)
浣溪沙(雪颔霜不自驚)……六四一又(料峭東風翠驚)…六四三又(陽羨姑蘇已買田)……六四五木蘭花令(元宵似是歡遊好)…六四八减字木蘭花(雲容皓白)……六五一西江月(公子眼花亂發)……六五三其二(小院朱蘭幾曲)……六五七其三(怪此花枝怨泣)……六五八木蘭花令(知君仙骨無寒暑)……六六虞美人(歸心正似三春草)……六六三臨江仙(一别都門三改火)…六六五八聲甘州(有情風萬里捲潮來)…六六三减字木蘭花(天台舊路)……六七三西江月(昨夜舟京口)…六七五定風波(月滿溪照夜堂)……六七七

@ -0,0 +1,17 @@
## 蘇轼詞编年校注
臨江仙(我勸髯張歸去好)……六八三蝶戀花(春事蘭珊芳草歇)……六八六臨江仙(尊酒何人懷李白)…六八九南歌子(雲裁新綠)……六九二滿江紅(清潁東流)……六九五木蘭花令(霜餘已失長淮闊)……六九九减字木蘭花(空牀響琢)……七〇一元祐七年壬申(一〇九二年)
减字木蘭花(春庭月午)……七〇四木蘭花令(高平四面開雄壘)……七〇七浣溪沙(芍藥樱桃雨新)……七〇八减字木蘭花(回風落景)……七一二生查子(三度别君來)·…七一四青玉案(三年枕上昊中路)…七一六元祐八年癸酉(一〇九三年)
行香子(三人明)………七二
又(清夜無·)……七二五紹聖元年甲戌(一〇九四年)
戚氏(玉龜山)…二八歸朝歡(我夢扁舟浮震澤)……七三七木蘭花令(梧桐葉上三更雨)……七四一浣溪沙(幾共查梨到雪霜).七四三又(菊暗荷枯一夜霜)……七四五又(羅襪空飛洛浦塵)……七四七西江月(馬趁香微路遠)………七五紹聖二年乙亥(一〇九五年)
臨江仙(九十日春都過了)………七五一蝶戀花(花褪殘紅青杏小)……七五三减字木蘭花(閩溪珍)·…七五七人娇(白顏)………七五九浣溪沙(輕汗微微透碧紈)……七六二又(人袂輕風不破塵)………七六四

@ -0,0 +1,17 @@
賀新郎(乳燕飛華屋)……七六六紹聖三年丙子(一〇九六年)
蝶戀花(泛泛東風初破五)……七七六三部樂(美人如月)……七七九雨中花慢(嫩臉羞蛾因甚).…七八二西江月(玉骨那愁瘴霧)………七八五
减字木蘭花(春牛春杖)……〇一千秋歲(島邊天外)……八三踏青(改火初晴)…八〇六
四録
元符三年庚辰(1一年
减字木蘭花(海南奇寳)…………八〇九鸪天(笑紅梅翠翘)……八一二
下册
二蘇軾未编年詞三十九首及残句十一則木蘭花令(經旬未識東君信)……八一五西江月(聞道雙·鳳.)……一七烏夜啼(怪歸心甚速)………八一九臨江仙(冬夜夜寒冰合井)…八二二又(誰道東陽都瘦損)……八二三又(昨夜渡江何處宿)………八二五漁家傲(一曲陽關情幾許)……八二七定風波(莫怪鴛帶長)……八二九南鄉子(冰雪透香肌)………八三一又(天與工知……三三

@ -0,0 +1,5 @@
中國古典文學基本叢書
# 蘇軾詞編年校註
鄒同慶王宗堂著

@ -0,0 +1,7 @@
蘇轼詞编年校注
又(寒玉細凝)…八三四又(悵望春杯)…八三五又(何處倚干)……八三六菩薩蠻(落花聞院春衫薄)………八三九又(火凝汗揮珠顆)……八四〇又(蟠南江浅红梅小)…·八四一又(塗香莫惜莲承步)…八四二又(玉爨墜耳黄金飾)…··八四四浣溪沙(畫华横江喜再)……八四五又(風捲珠簾自上鈎)…·…八四七又(花滿銀塘水漫流)……八四八又(風壓輕雲貼水飛)……八五〇南歌子(紫陌春去)……八五三又(笑怕薇胃)……八五五蝶戀花(一顆樱桃樊素口)……八五七减字木蘭花(玉房金蕊)……八五九
又(鶯初.語)…八六一行香子(昨夜霜風)………八六二點绛膺(红杏飄香)……八六三虞美人(.杯遥勸天月)…八六六阮郎歸(暗香浮月黄昏)……八六七謁金門(帷裹)…八六九又(秋池·)………八七〇又(今夜雨)…八七一好事近(煙外倚危樓)……八七二天仙子(走馬探花花发未)…八七三翻香令(金爐猶靡煤殘)……八七五桃源憶故人(华胥夢断人何處)……八七六沁春(情若連環)…八七七残句十一.八七九
## 蘇轼年校附…八八五

@ -0,0 +1,9 @@
一、他集互見詞八首
菩薩蠻(娟娟侵鬢妝痕淺)……八八七江城子(銀濤無際捲蓬瀛)………八八九減字木蘭花(誰妙筆)……八九三點絳.(醉漾輕舟)……八九六又(月轉烏)………八九八訴衷情(海棠珠綴一重重)………九〇醉落魄(醉醒醒醉)……九〇二瑶池燕(飛花成陣春心困)………九〇五二、蘇轼存疑詞十一首
蝶戀花(記得畫屏初會遇)……九〇九又(雨疏疏經潑火)…九一〇又(蝶懶慵春過半)………九一一雨中花慢(院重簾何處).…九一二浣溪沙(山色横侵蘸霞)……九一四江城子(腻紅匀臉襯檀屑)………九一七
虞美人(冰肌自是生來瘦)………九一八又(深深庭院.明過)……九二〇西江月(碧輕兩鳳)……九二二踏莎行(个禿奴)…九二三麟鸪天(羅帶雙垂不成)……九二五三誤入蘇集詞五十三首及残句九則鸪天(塞山白鷺飛)………九二九江城子(南來飛燕北歸鴻)……九三一沁春(小·深沈)……九三二虞美人(落花已作風前舞)………九三三蝶戀花(玉枕冰寒消暑氣)……九三五又(梨…初紅蟬韻歇)………九三六又(簾幕風輕雙語燕)……九三七又(一雾秋風驚畫扇)……九三八又(紫菊初生朱槿墜)……九三八永遇樂(天山)………九三九

@ -0,0 +1,5 @@
蘇轼编年校
意難忘(花摊鴛房)蒲庭芳(北苑龍圃)…九四二定風波(痛飲形骸騎蹇驢)…九四四人嬌(解了癡)…九四四浣溪沙(晚菊花前斂翠蛾……九四六又(玉冰寒滴露华)………九四七又(樓依江百尺高)……九四八阮郎歸(歌停檀板舞停鸞)……九四九菩薩蠻(濕雲不動溪橋冷)……九五〇木蘭花(檀槽碎響金絲撥)………九五一又(個人豐韻真堪羨)………九五二玉樓春(東風就腰兒細)……九五三如夢令(嘗記溪亭日暮)……九五四又(曾宴桃源深洞)……九五五點绛膺(高柳蟬晰)…九五六又(蹴丽秋千)…九八
又(春雨)………九九又(鶯踏花)…………九六〇祝英臺近()……九六一水調歌頭(已過一番雨)……九六二離别一久九六二洞仙歌(飛梁水)…九六三金菊對芙蓉(花則一名)……九六五踏青遊(·人人…九六六西江月(雨過輕風弄柳)…九六七探春令(玉窗蠅字記春寒)………九六八億秦娥(香馥馥)九六八滿江紅(不作三公)…九七〇卜算子(水是波.)……九七〇更漏子(柳絲長)……九七一又(春夜阐)…九七二清平調引(陌上花開蝴蝶飛).…九七三

@ -0,0 +1,5 @@
又(陌上山花無數開)…………九七三又(生前富貴草頭)……九七三履霜操(桓山之上)…九七四導引歌辭(父老)……九七五又(經文)……九七五踏莎行(山秀芙蓉)……九七六菩薩蠻(城頭尚有三鼓)……九七八西江月(古渡水.明月)…………九七九蝶戀花(花拂壶香徑小)…………九七九洞仙歌(殿角.生)…九八〇阮郎歸(夕陽滿樹亂鳴蟬)………九八〇残句九…九八一
目録
蘇轼年校註附錄……九九一一、蘇轼傳記九九三(一)蘇辙(亡兄子瞻端明墓誌銘〉…九九三(二)〈《宋·轼·》川二一六三、劉尚榮蘇軾集版本綜述》…一四二四序跋…五八蘇軾詞编年校註主要引用書目…一七三後記篇目筆畫索引川校改後記111

@ -0,0 +1 @@
# 蘇軾詞編年校註正編

@ -0,0 +1,17 @@
## 一、 蘇軾编年詞二九二首
華清引感首1
平時十月幸蓮湯三①玉甃瓊梁②°五家車馬如水③,珠璣满路旁④°牀⑤°獨留煙樹蒼蒼⑥°至今清夜月,依前過繚牆(三①°
## 【】
翠華一去掩方
,,(華清引〉前後片各四句,共四十五字,用平聲韻°二調無涉。傅本、元本無題。
三「前」毛本作[舊
【编年】
一蘇轼编年詞二九二首华清引三

@ -0,0 +1,11 @@
蘇轼詞编年校注
手车)年丙寅(一〇八六年)逆数二十三年,恰為治平元年甲辰公以是年罷鳳翔任,過長安,游驪山,作〈驪山三絕句》詩,(華清引》詞亦應作於此時
,月戊申,改温泉宫為華清宫。至天寶十四载,每年十月均[幸華清宫°「幸漢·蔡邕(獨断〉上:「天子所至,,日室。此指华清池的温泉浴室。
工工工,,水日,中

@ -0,0 +1,17 @@
瓷:本指井壁,此指温泉浴池池壁。
车绿,領袖正白,顧視御者,不及邀矣。」
珠满路旁:(舊唐書》卷五一(楊貴妃傳》「玄宗每年十月幸華清宫,國忠姊妹五家息從,每家為一隊,著一色,,出
。,
·。,。毁廢已久,今所存者,唯繚垣耳。」
## 一 蘇轼编年詞二九二首華清引

@ -0,0 +1,13 @@
蘇轼铜编年校注
一斛珠]
洛城春晚垂楊亂掩紅樓半①小池輕浪紋如篆②燭下花前,曾醉離歌宴流雲雨散關山有限情無限③待君重見尋芳伴為説相思,目断西樓燕④。
【校勘】
(一傅本、元本未收°案:調名(一斛珠)即(醉落魄〉°【编年】
氏父子三人赴京應試,三月從眉州出發,途經成都閩中,出褒斜谷,發横渠鎮,人鳳翔驛,過長安、洛陽,五、六月間到達汴京。此次經過洛陽時當在五月下旬二是嘉祐二年(一〇五七年),蘇軾母親病故,父子三人于是年五月離汴京,赴喪返家此次途經洛陽時亦當在五月三是嘉祐六年(一〇六一年),蘇轼在京,被任命為大理評事簽書鳳翔府判官。十一月離京赴任,十二月十四日到逹鳳翔,途經

@ -0,0 +1,5 @@
# 蘇軾詞編年校註 中
中國古典文學基本叢書
鄒同慶王宗堂著

@ -0,0 +1,5 @@
中國古典文學基本叢書
蘇軾詞編年校註下
鄒同慶王宗堂著

@ -0,0 +1,23 @@
圖書在版編目CIP数據
蘇軾詞編年校注/鄒同慶王宗堂著一北京中華書局2002
(中國古典文學基本叢書)
ISBN7-101-02603-6
.蘇…Ⅱ.①鄒…②王…Ⅲ.蘇軾(1036~1101
-宋詞-文學研究 Ⅳ.207.23中國版本書館CIP数據核字2000第38851號
責任编輯:劉尚榮
蘇軾詞编年校註(全三册)鄒同慶 王宗堂著
中華書局出版發行北京市豐臺區太平橋西里38號10073北京冠中印刷廠印刷
850×1168毫米1/32·36%印张·1261千字2002年9月第1版2002年9月北京第1次印刷印数1-5000册定價65.00元
ISBN7-101-02603-6/I·360

@ -0,0 +1,7 @@
地位。
大家知道,公元九六年北宋王朝建立,結束了唐末五代長期分裂割據的局面。到了仁宗慶暦時期,一方面,由於將近百年的承平,社會經濟繁榮,促進了文化的繁榮。另一方面,由於國家内外危機的晏殊張先柳永等的詞依舊盛行。儘管如此,詩文革新運動也為詞的革新準備了條件,在范仲淹、歐陽大中工

@ -0,0 +1,22 @@
蘇轼詞编年校注
上起了迥狂攔於既倒,障百川而東之的作用
詞人,卒於元豐元年(一〇七八),蘇軾這年四十三歲。他在(祭張子野文》中説:
(,並進一步說:詞即是古人的詩。在(與蔡景繁簡》中說:
頒示新詞,此古人長短句詩也。得之驚喜。試勉繼之,晚即面呈。((東坡續集》卷十一)
又在(答陈季常簡》中云:
又惠新詞,句句警拔,詩人之雄,非小詞也。但豪放太過,恐造物者不容人如此快活。((東坡
續集》卷十一)
這些可以說是蘇軾詞體革新的理論網領。在這裹他告訴我們兩點:(一)詞是詩的一體,當與詩同等看手工(無愁可解》(光景百年)一首,唐注:「案此詞向載各本東坡詞中,今據(山谷题跋》卷九、魏衍(後山詩註》卷九(答田生詩註》陳應行(于湖先生長短句序》移出錄此。詞序乃蘇軾所撰。蘇軾[簡中所云[豪放太過者,当即此作。詞的内容亦與[如此快活之意相合

@ -0,0 +1,11 @@
中工中)寒)(詩題為(中秋作》),(生查子(三度别君來)(詩題為(古别離送蘇伯固》),過去皆收人詩集。其二是集句詞此體始於王安石的(菩薩蠻》(数間茅屋閑臨水)(海棠亂發皆臨水),蘇軾用此體寫了三首(南鄉子》,乃集杜甫、韓愈、白居易劉禹錫、鄭谷、許渾、杜牧、李商隱、崔塗、韓、吴融等人的詩句為之又有(定風波》(雨洗娟娟嫩葉光),是集杜甫與白居易詩句為之王作豪放,蘇作近于婉約。其三是括詞。這是蘇軾的創舉,也有可能是受韓愈(月蝕詩效玉川子作》括盧仝(月蝕詩》的啟發。在此體中,他以(臨江仙》(冬夜夜寒冰合井)括李白的(夜坐吟》,(定風波》(與客攜壶上翠微)括杜牧的(九日齊安登高》(好睡慵開莫厭运)括他自己的(紅梅〉,(水調歌頭》(昵昵兒女語)括韓愈的(聽穎師兩篇引進許多散文句式,合文人樂,遂開以文為詞的新路。至於他詞中直接引用或暗中化用古人詩句,俯拾即是。這些都可以說明蘇軾是在突破詩詞的畛域,他的填詞,實際上是在詞的形式下作詩。
他開始填詞的時間,從現存的作品看,是在任杭州通判時期,這是在他已經寫過(鳳翔八觀)的(石
鼓歌王維吴道子畫》等名作以後,也就是在他詩的清雄風格形成以後。因而他初試詞筆,為表達内容
的需要,就帶着詩的風格人詞。可以說,這是他以詩為詞的開始,也是他建立豪放風格的開始。

@ -0,0 +1,7 @@
https://github.com/PaddlePaddle/PaddleOCR
[2023-04-29] OCR recognition: Baidu PaddlePaddle PaddleOCR testing and environment setup walkthrough (video)
https://www.bilibili.com/video/BV1w14y1Z7bD/?vd_source=13b33731bb79a73783e9f2c0e11857ae
[Installation docs]
https://github.com/PaddlePaddle/PaddleOCR/blob/main/docs/quick_start.md
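
For quick reference, installation per the quick-start doc above reduces to two pip commands (CPU build shown; pick the matching PaddlePaddle build for GPU machines):

```bash
pip install paddlepaddle   # PaddlePaddle framework (CPU build)
pip install paddleocr
```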

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@ -0,0 +1,22 @@
prune .github
prune applications
prune benchmark
prune configs
prune deploy
prune doc
prune docs
prune overrides
prune ppocr/ext_op
prune ppocr/losses
prune ppocr/metrics
prune ppocr/modeling
prune ppocr/optimizer
prune ppstructure/docs
prune test_tipc
prune tests
exclude .clang_format.hook
exclude .gitignore
exclude .pre-commit-config.yaml
exclude .style.yapf
exclude mkdocs.yml
exclude train.sh
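
A quick way to verify what this manifest keeps in (and prunes from) the source distribution is to build an sdist and list its contents; this is the standard packaging workflow, not a repo-specific script:

```bash
pip install build
python -m build --sdist
tar -tzf dist/*.tar.gz | less
```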

@ -0,0 +1,286 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner_cn.png" alt="PaddleOCR Banner"></a>
</p>
<!-- language -->
中文 | [English](./README_en.md)
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![Website](https://img.shields.io/badge/Website-PaddleOCR-blue?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAABmmRkdj0AAAAASUVORK5CYII=)](https://www.paddleocr.ai/)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 Introduction
Since its initial release, PaddleOCR has been widely adopted across academia, industry, and research thanks to its cutting-edge algorithms and proven performance in production, and it already powers well-known open-source projects such as Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the go-to open-source OCR toolkit for developers. On May 20, 2025, the PaddlePaddle team released **PaddleOCR 3.0**, fully adapted to the official **PaddlePaddle 3.0** framework. It further **improves text-recognition accuracy**, adds support for **multiple text types** and **handwriting recognition**, and meets the strong demand from large-model applications for **high-precision parsing of complex documents**. Combined with **ERNIE 4.5 Turbo**, it markedly improves key-information extraction accuracy, and it adds support for **domestic hardware such as KUNLUNXIN and Ascend**. For complete usage documentation, see the [PaddleOCR 3.0 documentation](https://paddlepaddle.github.io/PaddleOCR/latest/).

PaddleOCR 3.0 **adds** three flagship capabilities:
- Universal-scene text recognition model [PP-OCRv5](docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md): a single model covering five text types and complex handwriting; overall recognition accuracy is **up 13 percentage points** over the previous generation. [Online demo](https://aistudio.baidu.com/community/app/91660/webUI)
- General document-parsing solution [PP-StructureV3](docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.md): high-precision parsing of multi-scene, multi-layout PDFs, **leading many open- and closed-source solutions** on public benchmarks. [Online demo](https://aistudio.baidu.com/community/app/518494/webUI)
- Intelligent document-understanding solution [PP-ChatOCRv4](docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.md): natively powered by ERNIE 4.5 Turbo, with accuracy **up 15 percentage points** over the previous generation. [Online demo](https://aistudio.baidu.com/community/app/518493/webUI)

Beyond a strong model library, PaddleOCR 3.0 ships easy-to-learn tooling that covers model training, inference, and service deployment, so developers can bring AI applications to production quickly.
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch_cn.png" alt="PaddleOCR Architecture"></a>
</p>
</div>
## 📣 Recent updates
🔥🔥 2025.06.05: **PaddleOCR 3.0.1** released, including:
- **Optimized models and model configurations:**
  - Updated the default PP-OCRv5 model configuration: both detection and recognition now default to the server models rather than the mobile ones. To improve out-of-the-box results in most scenarios, the `limit_side_len` parameter in the configuration changed from 736 to 64 (a minimal override sketch follows at the end of this section).
  - Added the text-line orientation classification model `PP-LCNet_x1_0_textline_ori` (99.42% accuracy); it is now the default text-line orientation classifier in the OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines.
  - Optimized the text-line orientation classification model `PP-LCNet_x0_25_textline_ori`, raising its accuracy by 3.3 percentage points to 98.85%.
- **Optimizations and fixes for issues in version 3.0.0, [details](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)**

🔥🔥 2025.05.20: **PaddleOCR 3.0** officially released, including:
- **PP-OCRv5**: high-accuracy text recognition for all scenarios
  1. 🌐 A single model supports **five** text types (**Simplified Chinese**, **Traditional Chinese**, **Chinese pinyin**, **English**, and **Japanese**).
  2. ✍️ Supports complex **handwriting**: recognition of complex cursive and non-standard handwriting is markedly improved.
  3. 🎯 Overall recognition accuracy reaches SOTA in a wide range of application scenarios, **up 13 percentage points** over PP-OCRv4.
- **PP-StructureV3**: general document parsing
  1. 🧮 Supports high-precision parsing of multi-scene PDFs, **leading many open- and closed-source solutions** on the OmniDocBench benchmark.
  2. 🧠 Multiple specialized capabilities: **seal recognition**, **chart-to-table conversion**, **table recognition with nested formulas/images**, **vertical text parsing**, **complex table structure analysis**, and more.
- **PP-ChatOCRv4**: intelligent document understanding
  1. 🔥 Key-information extraction accuracy on document images (PDF/PNG/JPG) is **up 15 percentage points** over the previous generation.
  2. 💻 Natively supports **ERNIE 4.5 Turbo** and is compatible with large models deployed via PaddleNLP, Ollama, vLLM, and other tools.
  3. 🤝 Integrates [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), enabling extraction and understanding of printed text, handwriting, seals, tables, charts, and other common elements of complex documents.
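
As flagged in the 3.0.1 notes above, the default `limit_side_len` dropped from 736 to 64; if the old behavior is needed, it can be overridden when constructing the pipeline. A minimal sketch; `text_det_limit_side_len`/`text_det_limit_type` are the PaddleOCR 3.x constructor names for this setting, so verify the exact spelling against the parameter docs:

```python
from paddleocr import PaddleOCR

# Restore the pre-3.0.1 detection side-length limit (assumed keyword names)
ocr = PaddleOCR(
    text_det_limit_side_len=736,
    text_det_limit_type="min",
)
```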
## ⚡ Quick Start
### 1. Online demo
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. Local installation
Install **PaddlePaddle 3.0** by following the [installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html), then install paddleocr.
```bash
# Install paddleocr
pip install paddleocr
```
### 3. CLI inference
```bash
# Run PP-OCRv5 inference
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# Run PP-StructureV3 inference
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# Obtain a Qianfan API key first, then run PP-ChatOCRv4 inference
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# Show the detailed parameters for "paddleocr ocr"
paddleocr ocr --help
```
### 4. Python API inference
**4.1 PP-OCRv5 example**
```python
from paddleocr import PaddleOCR

# Initialize a PaddleOCR instance
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)

# Run OCR inference on a sample image
result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")

# Visualize the results and save them as JSON
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")
```
<details>
<summary><strong>4.2 PP-StructureV3 example</strong></summary>
```python
from paddleocr import PPStructureV3

pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

# For an image input
output = pipeline.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)

# Visualize the results and save them as JSON and Markdown
for res in output:
    res.print()
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```
</details>
<details>
<summary><strong>4.3 PP-ChatOCRv4 example</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc

chat_bot_config = {
    "module_name": "chat_bot",
    "model_name": "ernie-3.5-8k",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "openai",
    "api_key": "api_key",  # your api_key
}

retriever_config = {
    "module_name": "retriever",
    "model_name": "embedding-v1",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "qianfan",
    "api_key": "api_key",  # your api_key
}

pipeline = PPChatOCRv4Doc(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

visual_predict_res = pipeline.visual_predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
    use_common_ocr=True,
    use_seal_recognition=True,
    use_table_recognition=True,
)

mllm_predict_info = None
use_mllm = False
# To use a multimodal large model, start a local mllm service first; see
# https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.md
# for deployment, then update mllm_chat_bot_config accordingly.
if use_mllm:
    mllm_chat_bot_config = {
        "module_name": "chat_bot",
        "model_name": "PP-DocBee",
        "base_url": "http://127.0.0.1:8080/",  # your local mllm service url
        "api_type": "openai",
        "api_key": "api_key",  # your api_key
    }
    mllm_predict_res = pipeline.mllm_pred(
        input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
        key_list=["驾驶室准乘人数"],
        mllm_chat_bot_config=mllm_chat_bot_config,
    )
    mllm_predict_info = mllm_predict_res["mllm_res"]

visual_info_list = []
for res in visual_predict_res:
    visual_info_list.append(res["visual_info"])
    layout_parsing_result = res["layout_parsing_result"]

vector_info = pipeline.build_vector(
    visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
    key_list=["驾驶室准乘人数"],
    visual_info=visual_info_list,
    vector_info=vector_info,
    mllm_predict_info=mllm_predict_info,
    chat_bot_config=chat_bot_config,
    retriever_config=retriever_config,
)
print(chat_result)
```
</details>
### 5. **Domestic hardware support**
- [KUNLUNXIN installation guide](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
- [Ascend installation guide](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)

## ⛰️ Advanced tutorials
- [PP-OCRv5 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 Demos
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo"></a>
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo"></a>
</p>
</div>
## 👩‍👩‍👧‍👦 Developer community
| Follow the PaddlePaddle WeChat official account | Join the tech discussion group |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 🏆 Awesome projects using PaddleOCR
PaddleOCR would not be where it is today without community contributions! 💗 Heartfelt thanks to every developer, partner, and contributor!
| Project Name | Description |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG-based AI workflow engine|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Multi-format document to Markdown conversion tool|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Open-source batch offline OCR software|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |Pure-vision-based GUI agent screen-parsing tool|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question answering over arbitrary content|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|Efficient toolkit for extracting content from complex PDF documents|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Real-time on-screen translation tool|
| [More projects](./awesome_projects.md) | |
## 👩‍👩‍👧‍👦 Contributors
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 License
This project is released under the [Apache 2.0 license](LICENSE).
## 🎓 Citation
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

@ -0,0 +1,305 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR Banner"></a>
</p>
<!-- language -->
[中文](./readme_c.md) | English
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![Website](https://img.shields.io/badge/Website-PaddleOCR-blue?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAABmmRkdj0AAAAASUVORK5CYII=)](https://www.paddleocr.ai/)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 Introduction
Since its initial release, PaddleOCR has gained widespread acclaim across academia, industry, and research communities, thanks to its cutting-edge algorithms and proven performance in real-world applications. It's already powering popular open-source projects like Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the go-to OCR toolkit for developers worldwide.
On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the **PaddlePaddle 3.0** framework. This update further **boosts text-recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. Combined with **ERNIE 4.5 Turbo**, it significantly enhances key-information extraction accuracy. PaddleOCR 3.0 also introduces support for domestic hardware platforms such as **KUNLUNXIN** and **Ascend**. For the complete usage documentation, please refer to the [PaddleOCR 3.0 Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
Three Major New Features in PaddleOCR 3.0:
- Universal-Scene Text Recognition Model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model that handles five different text types plus complex handwriting. Overall recognition accuracy has increased by 13 percentage points over the previous generation. [Online Demo](https://aistudio.baidu.com/community/app/91660/webUI)
- General Document-Parsing Solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Delivers high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. [Online Demo](https://aistudio.baidu.com/community/app/518494/webUI)
- Intelligent Document-Understanding Solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by ERNIE 4.5 Turbo, achieving 15 percentage points higher accuracy than its predecessor. [Online Demo](https://aistudio.baidu.com/community/app/518493/webUI)
In addition to providing an outstanding model library, PaddleOCR 3.0 also offers user-friendly tools covering model training, inference, and service deployment, so developers can rapidly bring AI applications to production.
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR Architecture"></a>
</p>
</div>
## 📣 Recent updates
#### **🔥🔥 2025.06.05: Release of PaddleOCR 3.0.1, includes:**
- **Optimization of certain models and model configurations:**
- Updated the default model configuration for PP-OCRv5, changing both detection and recognition from mobile to server models. To improve default performance in most scenarios, the parameter `limit_side_len` in the configuration has been changed from 736 to 64.
- Added a new text line orientation classification model `PP-LCNet_x1_0_textline_ori` with an accuracy of 99.42%. The default text line orientation classifier for OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines has been updated to this model.
- Optimized the text line orientation classification model `PP-LCNet_x0_25_textline_ori`, improving accuracy by 3.3 percentage points to a current accuracy of 98.85%.
- **Optimizations and fixes for some issues in version 3.0.0, [details](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
🔥🔥2025.05.20: Official Release of **PaddleOCR v3.0**, including:
- **PP-OCRv5**: High-Accuracy Text Recognition Model for All Scenarios - Instant Text from Images/PDFs.
1. 🌐 Single-model support for **five** text types - Seamlessly process **Simplified Chinese, Traditional Chinese, Simplified Chinese Pinyin, English** and **Japanese** within a single model.
2. ✍️ Improved **handwriting recognition**: Significantly better at complex cursive scripts and non-standard handwriting.
3. 🎯 **13-point accuracy gain** over PP-OCRv4, achieving state-of-the-art performance across a variety of real-world scenarios.
- **PP-StructureV3**: General-Purpose Document Parsing, unleashing SOTA parsing of images/PDFs for real-world scenarios!
1. 🧮 **High-Accuracy multi-scene PDF parsing**, leading both open- and closed-source solutions on the OmniDocBench benchmark.
2. 🧠 Specialized capabilities include **seal recognition**, **chart-to-table conversion**, **table recognition with nested formulas/images**, **vertical text document parsing**, and **complex table structure analysis**.
- **PP-ChatOCRv4**: Intelligent Document Understanding, extracting key information (not just text) from images/PDFs.
1. 🔥 **15-point accuracy gain** in key-information extraction on PDF/PNG/JPG files over the previous generation.
2. 💻 Native support for **ERNIE 4.5 Turbo**, with compatibility for large-model deployments via PaddleNLP, Ollama, vLLM, and more.
3. 🤝 Integrated [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), enabling extraction and understanding of printed text, handwriting, seals, tables, charts, and other common elements in complex documents.
<details>
<summary><strong>The history of updates </strong></summary>
- 🔥🔥2025.03.07: Release of **PaddleOCR v2.10**, including:
- **12 new self-developed models:**
- **[Layout Detection series](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/layout_detection.html)** (3 models): PP-DocLayout-L, M, and S -- capable of detecting 23 common layout types across diverse document formats (papers, reports, exams, books, magazines, contracts, etc.) in English and Chinese. Achieves up to **90.4% mAP@0.5**, and the lightweight models can process over 100 pages per second.
- **[Formula Recognition series](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/formula_recognition.html)**(2 models): PP-FormulaNet-L and S -- supports recognition of 50,000+ LaTeX expressions, handling both printed and handwritten formulas. PP-FormulaNet-L offers **6% higher accuracy** than comparable models; PP-FormulaNet-S is 16x faster while maintaining similar accuracy.
- **[Table Structure Recognition series](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/table_structure_recognition.html)**(2 models): SLANeXt_wired and SLANeXt_wireless -- newly developed models with **6% accuracy improvement** over SLANet_plus in complex table recognition.
- **[Table Classification](https://paddlepaddle.github.io/PaddleX/latest/en/module_usage/tutorials/ocr_modules/table_classification.html)**(1 model):
PP-LCNet_x1_0_table_cls -- an ultra-lightweight classifier for wired and wireless tables.
[Learn more](https://paddlepaddle.github.io/PaddleOCR/latest/en/update.html)
</details>
## ⚡ Quick Start
### 1. Run online demo
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. Installation
Install PaddlePaddle by following the [Installation Guide](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), and then install the PaddleOCR toolkit.
```bash
# Install paddleocr
pip install paddleocr
```
### 3. Run inference by CLI
```bash
# Run PP-OCRv5 inference
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# Run PP-StructureV3 inference
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# Get the Qianfan API Key at first, and then run PP-ChatOCRv4 inference
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# Get more information about "paddleocr ocr"
paddleocr ocr --help
```
### 4. Run inference by API
**4.1 PP-OCRv5 Example**
```python
from paddleocr import PaddleOCR

# Initialize PaddleOCR instance
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)

# Run OCR inference on a sample image
result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")

# Visualize the results and save the JSON results
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")
```
<details>
<summary><strong>4.2 PP-StructureV3 Example</strong></summary>
```python
from paddleocr import PPStructureV3

pipeline = PPStructureV3()

# For an image input
output = pipeline.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

# Visualize the results and save the JSON results
for res in output:
    res.print()
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```
</details>
<details>
<summary><strong>4.3 PP-ChatOCRv4 Example</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc

chat_bot_config = {
    "module_name": "chat_bot",
    "model_name": "ernie-3.5-8k",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "openai",
    "api_key": "api_key",  # your api_key
}

retriever_config = {
    "module_name": "retriever",
    "model_name": "embedding-v1",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "qianfan",
    "api_key": "api_key",  # your api_key
}

pipeline = PPChatOCRv4Doc(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

visual_predict_res = pipeline.visual_predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
    use_common_ocr=True,
    use_seal_recognition=True,
    use_table_recognition=True,
)

mllm_predict_info = None
use_mllm = False
# If a multimodal large model is used, a local mllm service needs to be started first; see
# https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.md
# for deployment, then update the mllm_chat_bot_config configuration.
if use_mllm:
    mllm_chat_bot_config = {
        "module_name": "chat_bot",
        "model_name": "PP-DocBee",
        "base_url": "http://127.0.0.1:8080/",  # your local mllm service url
        "api_type": "openai",
        "api_key": "api_key",  # your api_key
    }
    mllm_predict_res = pipeline.mllm_pred(
        input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
        key_list=["驾驶室准乘人数"],
        mllm_chat_bot_config=mllm_chat_bot_config,
    )
    mllm_predict_info = mllm_predict_res["mllm_res"]

visual_info_list = []
for res in visual_predict_res:
    visual_info_list.append(res["visual_info"])
    layout_parsing_result = res["layout_parsing_result"]

vector_info = pipeline.build_vector(
    visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
    key_list=["驾驶室准乘人数"],
    visual_info=visual_info_list,
    vector_info=vector_info,
    mllm_predict_info=mllm_predict_info,
    chat_bot_config=chat_bot_config,
    retriever_config=retriever_config,
)
print(chat_result)
```
</details>
### 5. Domestic AI Accelerators
- [Huawei Ascend](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
- [KUNLUNXIN](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
## ⛰️ Advanced Tutorials
- [PP-OCRv5 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 Quick Overview of Execution Results
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo"></a>
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo"></a>
</p>
</div>
## 👩‍👩‍👧‍👦 Community
| PaddlePaddle WeChat official account | Join the tech discussion group |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 😃 Awesome Projects Leveraging PaddleOCR
PaddleOCR wouldn't be where it is today without its incredible community! 💗 A massive thank you to all our longtime partners, new collaborators, and everyone who's poured their passion into PaddleOCR — whether we've named you or not. Your support fuels our fire!
| Project Name | Description |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG engine based on deep document understanding.|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Multi-type Document to Markdown Conversion Tool|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, Open-source, Batch Offline OCR Software.|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: Screen Parsing tool for Pure Vision Based GUI Agent.|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question and Answer based on Anything.|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|A powerful open-source toolkit designed to efficiently extract high-quality content from complex and diverse PDF documents.|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Recognize text on the screen, translate it and show the translation results in real time.|
| [Learn more projects](./awesome_projects.md) | [More projects based on PaddleOCR](./awesome_projects.md)|
## 👩‍👩‍👧‍👦 Contributors
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 License
This project is released under the [Apache 2.0 license](LICENSE).
## 🎓 Citation
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

@ -0,0 +1 @@
See the [docs](https://paddlepaddle.github.io/PaddleOCR/latest/applications/overview.html).

@ -0,0 +1,28 @@
## 😃 Awesome projects based on PaddleOCR
💗 PaddleOCR wouldn't be where it is today without its incredible community! A massive 🙌 thank you 🙌 to all our longtime partners, new collaborators, and everyone who's poured their passion into PaddleOCR — whether we've named you or not. Your support fuels our fire! 🔥
| Project Name | Description |
| ------------ | ----------- |
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, Open-source, Batch Offline OCR Software.|
| [LearnOpenCV](http://github.com/spmallick/learnopencv) <a href="http://github.com/spmallick/learnopencv"><img src="https://img.shields.io/github/stars/spmallick/learnopencv"></a> | code for Computer Vision, Deep learning, and AI research articles.|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: Screen Parsing tool for Pure Vision Based GUI Agent.|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question and Answer based on Anything.|
| [PaddleHub](https://github.com/PaddlePaddle/PaddleHub)<a href="https://github.com/PaddlePaddle/PaddleHub"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleHub"></a> |400+ AI Models: Rich, high-quality AI models, including CV, NLP, Speech, Video and Cross-Modal.|
| [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP)<a href="https://github.com/PaddlePaddle/PaddleNLP"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleNLP"></a> |A Large Language Model (LLM) development suite based on the PaddlePaddle.|
| [Rerun](https://github.com/rerun-io/rerun) <a href="https://github.com/rerun-io/rerun"><img src="https://img.shields.io/github/stars/rerun-io/rerun"></a> | Rerun is building the multimodal data stack to model, ingest, store, query and view robotics-style data |
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator) <a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> | Recognize text on the screen, translate it and show the translation results in real time.|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a> | PDF-Extract-Kit is a powerful open-source toolkit designed to efficiently extract high-quality content from complex and diverse PDF documents. |
| [manga-image-translator](https://github.com/zyddnys/manga-image-translator) <a href="https://github.com/zyddnys/manga-image-translator"><img src="https://img.shields.io/github/stars/zyddnys/manga-image-translator"></a> | Translate texts in manga/images.|
| [March7thAssistant](https://github.com/moesnow/March7thAssistant) <a href="https://github.com/moesnow/March7thAssistant"><img src="https://img.shields.io/github/stars/moesnow/March7thAssistant"></a> | Daily Tasks: Stamina recovery, daily training, claiming rewards, commissions, and farming. |
| [PaddlePaddle/models](https://github.com/PaddlePaddle/models) <a href="https://github.com/PaddlePaddle/models"><img src="https://img.shields.io/github/stars/PaddlePaddle/models"></a> |PaddlePaddle's industrial-grade model zoo.|
| [katanaml/sparrow](https://github.com/katanaml/sparrow) <a href="https://github.com/katanaml/sparrow"><img src="https://img.shields.io/github/stars/katanaml/sparrow"></a> | Sparrow is an innovative open-source solution for efficient data extraction and processing from various documents and images. |
| [RapidOCR](https://github.com/RapidAI/RapidOCR) <a href="https://github.com/RapidAI/RapidOCR"><img src="https://img.shields.io/github/stars/RapidAI/RapidOCR"></a> | Awesome OCR multiple programing languages toolkits based on ONNXRuntime, OpenVINO, PaddlePaddle and PyTorch |
| [autoMate](https://github.com/yuruotong1/autoMate) <a href="https://github.com/yuruotong1/autoMate"><img src="https://img.shields.io/github/stars/yuruotong1/autoMate"></a> | AI-Powered Local Automation Tool & Let Your Computer Work for You. |
| [Agent-S](https://github.com/simular-ai/Agent-S) <a href="https://github.com/simular-ai/Agent-S"><img src="https://img.shields.io/github/stars/simular-ai/Agent-S"></a> | A Compositional Generalist-Specialist Framework for Computer Use Agents. |
| [pdf-craft](https://github.com/oomol-lab/pdf-craft) <a href="https://github.com/oomol-lab/pdf-craft"><img src="https://img.shields.io/github/stars/oomol-lab/pdf-craft"></a> | PDF Craft can convert PDF files into various other formats. |
| [VV](https://github.com/Cicada000/VV) <a href="https://github.com/Cicada000/VV"><img src="https://img.shields.io/github/stars/Cicada000/VV"></a> | Zhang Weiwei Quotations Search Project. |
| [docetl](https://github.com/ucbepic/docetl) <a href="https://github.com/ucbepic/docetl"><img src="https://img.shields.io/github/stars/ucbepic/docetl"></a> | DocETL is a tool for creating and executing data processing pipelines, especially suited for complex document processing tasks. |
| [ZenlessZoneZero-Auto](https://github.com/sMythicalBird/ZenlessZoneZero-Auto) <a href="https://github.com/sMythicalBird/ZenlessZoneZero-Auto"><img src="https://img.shields.io/github/stars/sMythicalBird/ZenlessZoneZero-Auto"></a> | Zenless Zone Zero Automation Framework. |
| [Yuxi-Know](https://github.com/xerrors/Yuxi-Know) <a href="https://github.com/xerrors/Yuxi-Know"><img src="https://img.shields.io/github/stars/xerrors/Yuxi-Know"></a> | Knowledge graph question answering system based on LLMs. |
| [python-office](https://github.com/CoderWanFeng/python-office) <a href="https://github.com/CoderWanFeng/python-office"><img src="https://img.shields.io/github/stars/CoderWanFeng/python-office"></a> | Python tool for office works. |
| [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR) <a href="https://github.com/jingsongliujing/OnnxOCR"><img src="https://img.shields.io/github/stars/jingsongliujing/OnnxOCR"></a>|A lightweight OCR system based on PaddleOCR, decoupled from the PaddlePaddle deep learning training framework, with ultra-fast inference speed |
| ... |... |

@ -0,0 +1,2 @@
*.html linguist-language=python
*.ipynb linguist-language=python

@ -0,0 +1,16 @@
.DS_Store
*.pth
*.pyc
*.pyo
*.log
*.tmp
*.pkl
__pycache__/
.idea/
output/
test/*.jpg
datasets/
index/
train_log/
log/
profiling_log/

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@ -0,0 +1,132 @@
# Real-time Scene Text Detection with Differentiable Binarization
**Note**: some code is inherited from [WenmuZhou/DBNet.pytorch](https://github.com/WenmuZhou/DBNet.pytorch)
[Chinese explanation](https://zhuanlan.zhihu.com/p/94677957)
![network](imgs/paper/db.jpg)
## Update
2020-06-07: Added grayscale training. When training on grayscale images, remove `dataset.args.transforms.Normalize` from the config.
## Install Using Conda
```
conda env create -f environment.yml
git clone https://github.com/WenmuZhou/DBNet.paddle.git
cd DBNet.paddle/
```
or
## Install Manually
```bash
conda create -n dbnet python=3.6
conda activate dbnet
conda install ipython pip
# python dependencies
pip install -r requirement.txt
# clone repo
git clone https://github.com/WenmuZhou/DBNet.paddle.git
cd DBNet.paddle/
```
## Requirements
* paddlepaddle 2.4+
## Download
TBD
## Data Preparation
Training data: prepare a text file `train.txt` in the following format, using `'\t'` as the separator:
```
./datasets/train/img/001.jpg ./datasets/train/gt/001.txt
```
Validation data: prepare a text file `test.txt` in the following format, using `'\t'` as the separator:
```
./datasets/test/img/001.jpg ./datasets/test/gt/001.txt
```
- Store images in the `img` folder
- Store groundtruth in the `gt` folder
The groundtruth can be `.txt` files, with the following format:
```
x1, y1, x2, y2, x3, y3, x4, y4, annotation
```
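For reference, the snippet below is a minimal sketch of how these files can be parsed. It assumes UTF-8 files laid out exactly as above; the helper names and example paths are illustrative only, and the repository's actual loading logic lives in `data_loader/dataset.py`.
```python
import numpy as np

def load_pairs(list_file):
    # each line: <img_path>\t<gt_path>
    pairs = []
    with open(list_file, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) == 2:
                pairs.append(tuple(parts))
    return pairs

def load_annotation(gt_path):
    # each line: x1, y1, x2, y2, x3, y3, x4, y4, annotation
    polys, texts = [], []
    with open(gt_path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) < 9:
                continue
            polys.append(np.array(parts[:8], dtype=float).reshape(4, 2))
            texts.append(",".join(parts[8:]).strip())  # annotation may itself contain commas
    return polys, texts

for img_path, gt_path in load_pairs("./datasets/train.txt"):
    polys, texts = load_annotation(gt_path)
    print(img_path, len(polys), texts[:3])
```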
## Train
1. Configure `dataset['train']['dataset']['args']['data_path']` and `dataset['validate']['dataset']['args']['data_path']` in [config/icdar2015_resnet18_fpn_DBhead_polyLR.yaml](config/icdar2015_resnet18_fpn_DBhead_polyLR.yaml)
* Single-GPU training
```bash
bash single_gpu_train.sh
```
* Multi-GPU training
```bash
bash multi_gpu_train.sh
```
## Test
[eval.py](tools/eval.py) is used to evaluate the model on the test dataset
1. Configure `model_path` in [eval.sh](eval.sh)
2. Use the following script to test:
```bash
bash eval.sh
```
## Predict
[predict.py](tools/predict.py) can be used to run inference on all images in a folder
1. Configure `model_path`, `input_folder`, and `output_folder` in [predict.sh](predict.sh)
2. Use the following script to predict:
```
bash predict.sh
```
You can change the `model_path` in the `predict.sh` file to your model location.
Tip: if the results are poor, try adjusting `thre` in [predict.sh](predict.sh).
## Export Model
[export_model.py](tools/export_model.py) exports a trained checkpoint as an inference model
Use the following script to export the inference model:
```
python tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o trainer.resume_checkpoint=model_best.pth trainer.output_dir=output/infer
```
## Paddle Inference
[infer.py](tools/infer.py) runs inference with the exported inference model
Use the following script to run Paddle Inference:
```
python tools/infer.py --model-dir=output/infer/ --img-path imgs/paper/db.jpg
```
<h2 id="Performance">Performance</h2>
### [ICDAR 2015](http://rrc.cvc.uab.es/?ch=4)
Trained only on the ICDAR2015 dataset.
| Method | image size (short size) |learning rate | Precision (%) | Recall (%) | F-measure (%) | FPS |
|:--------------------------:|:-------:|:--------:|:--------:|:------------:|:---------------:|:-----:|
| ImageNet-resnet50-FPN-DBHead (PyTorch) |736 |1e-3| 90.19 | 78.14 | 83.88 | 27 |
| ImageNet-resnet50-FPN-DBHead (Paddle) |736 |1e-3| 89.47 | 79.03 | 83.92 | 27 |
| ImageNet-resnet50-FPN-DBHead (Paddle, AMP) |736 |1e-3| 88.62 | 79.95 | 84.06 | 27 |
### Examples
TBD
### Reference
1. https://arxiv.org/pdf/1911.08947.pdf
2. https://github.com/WenmuZhou/DBNet.pytorch
**If this repository helps you, please star it. Thanks!**

@ -0,0 +1,2 @@
from .base_trainer import BaseTrainer
from .base_dataset import BaseDataSet

@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 13:12
# @Author : zhoujun
import copy
import cv2
import numpy as np
from paddle.io import Dataset
from data_loader.modules import *
class BaseDataSet(Dataset):
def __init__(
self,
data_path: str,
img_mode,
pre_processes,
filter_keys,
ignore_tags,
transform=None,
target_transform=None,
):
        assert img_mode in ["RGB", "BGR", "GRAY"]
self.ignore_tags = ignore_tags
self.data_list = self.load_data(data_path)
item_keys = ["img_path", "img_name", "text_polys", "texts", "ignore_tags"]
for item in item_keys:
            assert (
                item in self.data_list[0]
            ), "data_list from load_data must contain {}".format(item)
self.img_mode = img_mode
self.filter_keys = filter_keys
self.transform = transform
self.target_transform = target_transform
self._init_pre_processes(pre_processes)
def _init_pre_processes(self, pre_processes):
self.aug = []
if pre_processes is not None:
for aug in pre_processes:
if "args" not in aug:
args = {}
else:
args = aug["args"]
if isinstance(args, dict):
cls = eval(aug["type"])(**args)
else:
cls = eval(aug["type"])(args)
self.aug.append(cls)
def load_data(self, data_path: str) -> list:
"""
把数据加载为一个list
:params data_path: 存储数据的文件夹或者文件
return a dict ,包含了'img_path','img_name','text_polys','texts','ignore_tags'
"""
raise NotImplementedError
def apply_pre_processes(self, data):
for aug in self.aug:
data = aug(data)
return data
def __getitem__(self, index):
try:
data = copy.deepcopy(self.data_list[index])
im = cv2.imread(data["img_path"], 1 if self.img_mode != "GRAY" else 0)
if self.img_mode == "RGB":
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
data["img"] = im
data["shape"] = [im.shape[0], im.shape[1]]
data = self.apply_pre_processes(data)
if self.transform:
data["img"] = self.transform(data["img"])
data["text_polys"] = data["text_polys"].tolist()
if len(self.filter_keys):
data_dict = {}
for k, v in data.items():
if k not in self.filter_keys:
data_dict[k] = v
return data_dict
else:
return data
        except Exception:
            # fall back to a random sample if loading or augmentation fails
            return self.__getitem__(np.random.randint(self.__len__()))
def __len__(self):
return len(self.data_list)

@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:50
# @Author : zhoujun
import os
import pathlib
import shutil
from pprint import pformat
import anyconfig
import paddle
import numpy as np
import random
from paddle.jit import to_static
from paddle.static import InputSpec
from utils import setup_logger
class BaseTrainer:
def __init__(
self,
config,
model,
criterion,
train_loader,
validate_loader,
metric_cls,
post_process=None,
):
config["trainer"]["output_dir"] = os.path.join(
str(pathlib.Path(os.path.abspath(__name__)).parent),
config["trainer"]["output_dir"],
)
config["name"] = config["name"] + "_" + model.name
self.save_dir = config["trainer"]["output_dir"]
self.checkpoint_dir = os.path.join(self.save_dir, "checkpoint")
os.makedirs(self.checkpoint_dir, exist_ok=True)
self.global_step = 0
self.start_epoch = 0
self.config = config
self.criterion = criterion
# logger and tensorboard
self.visualdl_enable = self.config["trainer"].get("visual_dl", False)
self.epochs = self.config["trainer"]["epochs"]
self.log_iter = self.config["trainer"]["log_iter"]
if paddle.distributed.get_rank() == 0:
anyconfig.dump(config, os.path.join(self.save_dir, "config.yaml"))
self.logger = setup_logger(os.path.join(self.save_dir, "train.log"))
self.logger_info(pformat(self.config))
self.model = self.apply_to_static(model)
# device
if (
paddle.device.cuda.device_count() > 0
and paddle.device.is_compiled_with_cuda()
):
self.with_cuda = True
random.seed(self.config["trainer"]["seed"])
np.random.seed(self.config["trainer"]["seed"])
paddle.seed(self.config["trainer"]["seed"])
else:
self.with_cuda = False
self.logger_info("train with and paddle {}".format(paddle.__version__))
# metrics
self.metrics = {
"recall": 0,
"precision": 0,
"hmean": 0,
"train_loss": float("inf"),
"best_model_epoch": 0,
}
self.train_loader = train_loader
if validate_loader is not None:
assert post_process is not None and metric_cls is not None
self.validate_loader = validate_loader
self.post_process = post_process
self.metric_cls = metric_cls
self.train_loader_len = len(train_loader)
if self.validate_loader is not None:
self.logger_info(
"train dataset has {} samples,{} in dataloader, validate dataset has {} samples,{} in dataloader".format(
len(self.train_loader.dataset),
self.train_loader_len,
len(self.validate_loader.dataset),
len(self.validate_loader),
)
)
else:
self.logger_info(
"train dataset has {} samples,{} in dataloader".format(
len(self.train_loader.dataset), self.train_loader_len
)
)
self._initialize_scheduler()
self._initialize_optimizer()
# resume or finetune
if self.config["trainer"]["resume_checkpoint"] != "":
self._load_checkpoint(
self.config["trainer"]["resume_checkpoint"], resume=True
)
elif self.config["trainer"]["finetune_checkpoint"] != "":
self._load_checkpoint(
self.config["trainer"]["finetune_checkpoint"], resume=False
)
if self.visualdl_enable and paddle.distributed.get_rank() == 0:
from visualdl import LogWriter
self.writer = LogWriter(self.save_dir)
        # mixed-precision (AMP) training
self.amp = self.config.get("amp", None)
if self.amp == "None":
self.amp = None
if self.amp:
self.amp["scaler"] = paddle.amp.GradScaler(
init_loss_scaling=self.amp.get("scale_loss", 1024),
use_dynamic_loss_scaling=self.amp.get("use_dynamic_loss_scaling", True),
)
self.model, self.optimizer = paddle.amp.decorate(
models=self.model,
optimizers=self.optimizer,
level=self.amp.get("amp_level", "O2"),
)
        # distributed training
if paddle.device.cuda.device_count() > 1:
self.model = paddle.DataParallel(self.model)
# make inverse Normalize
self.UN_Normalize = False
for t in self.config["dataset"]["train"]["dataset"]["args"]["transforms"]:
if t["type"] == "Normalize":
self.normalize_mean = t["args"]["mean"]
self.normalize_std = t["args"]["std"]
self.UN_Normalize = True
def apply_to_static(self, model):
support_to_static = self.config["trainer"].get("to_static", False)
if support_to_static:
            # input spec: NCHW with dynamic height and width
            specs = [InputSpec([None, 3, -1, -1])]
model = to_static(model, input_spec=specs)
self.logger_info(
"Successfully to apply @to_static with specs: {}".format(specs)
)
return model
def train(self):
"""
Full training logic
"""
for epoch in range(self.start_epoch + 1, self.epochs + 1):
self.epoch_result = self._train_epoch(epoch)
self._on_epoch_finish()
if paddle.distributed.get_rank() == 0 and self.visualdl_enable:
self.writer.close()
self._on_train_finish()
def _train_epoch(self, epoch):
"""
Training logic for an epoch
:param epoch: Current epoch number
"""
raise NotImplementedError
def _eval(self, epoch):
"""
eval logic for an epoch
:param epoch: Current epoch number
"""
raise NotImplementedError
def _on_epoch_finish(self):
raise NotImplementedError
def _on_train_finish(self):
raise NotImplementedError
def _save_checkpoint(self, epoch, file_name):
"""
Saving checkpoints
:param epoch: current epoch number
:param log: logging information of the epoch
:param save_best: if True, rename the saved checkpoint to 'model_best.pth.tar'
"""
state_dict = self.model.state_dict()
state = {
"epoch": epoch,
"global_step": self.global_step,
"state_dict": state_dict,
"optimizer": self.optimizer.state_dict(),
"config": self.config,
"metrics": self.metrics,
}
filename = os.path.join(self.checkpoint_dir, file_name)
paddle.save(state, filename)
def _load_checkpoint(self, checkpoint_path, resume):
"""
Resume from saved checkpoints
:param checkpoint_path: Checkpoint path to be resumed
"""
self.logger_info("Loading checkpoint: {} ...".format(checkpoint_path))
checkpoint = paddle.load(checkpoint_path)
self.model.set_state_dict(checkpoint["state_dict"])
if resume:
self.global_step = checkpoint["global_step"]
self.start_epoch = checkpoint["epoch"]
self.config["lr_scheduler"]["args"]["last_epoch"] = self.start_epoch
# self.scheduler.load_state_dict(checkpoint['scheduler'])
self.optimizer.set_state_dict(checkpoint["optimizer"])
if "metrics" in checkpoint:
self.metrics = checkpoint["metrics"]
self.logger_info(
"resume from checkpoint {} (epoch {})".format(
checkpoint_path, self.start_epoch
)
)
else:
self.logger_info("finetune from checkpoint {}".format(checkpoint_path))
def _initialize(self, name, module, *args, **kwargs):
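        # Reflection-style factory: look up self.config[name]["type"] inside
        # `module` and instantiate it with the config args merged with kwargs.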
module_name = self.config[name]["type"]
module_args = self.config[name].get("args", {})
assert all(
[k not in module_args for k in kwargs]
), "Overwriting kwargs given in config file is not allowed"
module_args.update(kwargs)
return getattr(module, module_name)(*args, **module_args)
def _initialize_scheduler(self):
self.lr_scheduler = self._initialize("lr_scheduler", paddle.optimizer.lr)
def _initialize_optimizer(self):
self.optimizer = self._initialize(
"optimizer",
paddle.optimizer,
parameters=self.model.parameters(),
learning_rate=self.lr_scheduler,
)
def inverse_normalize(self, batch_img):
if self.UN_Normalize:
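            # undo the channel-wise Normalize (x = x * std + mean) so images
            # can be visualized in their original value range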
batch_img[:, 0, :, :] = (
batch_img[:, 0, :, :] * self.normalize_std[0] + self.normalize_mean[0]
)
batch_img[:, 1, :, :] = (
batch_img[:, 1, :, :] * self.normalize_std[1] + self.normalize_mean[1]
)
batch_img[:, 2, :, :] = (
batch_img[:, 2, :, :] * self.normalize_std[2] + self.normalize_mean[2]
)
def logger_info(self, s):
if paddle.distributed.get_rank() == 0:
self.logger.info(s)

@ -0,0 +1,40 @@
name: DBNet
dataset:
train:
dataset:
      type: SynthTextDataset # dataset type
      args:
        data_path: '' # SynthTextDataset root directory
        pre_processes: # data preprocessing, including augmentation and label generation
          - type: IaaAugment # apply transforms with imgaug
args:
- {'type':Fliplr, 'args':{'p':0.5}}
- {'type': Affine, 'args':{'rotate':[-10,10]}}
- {'type':Resize,'args':{'size':[0.5,3]}}
- type: EastRandomCropData
args:
size: [640,640]
max_tries: 50
keep_ratio: true
- type: MakeBorderMap
args:
shrink_ratio: 0.4
- type: MakeShrinkMap
args:
shrink_ratio: 0.4
min_text_size: 8
        transforms: # transforms applied to the image
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
        filter_keys: ['img_path','img_name','text_polys','texts','ignore_tags','shape'] # keys removed from the data dict before it is returned
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ''

@ -0,0 +1,65 @@
name: DBNet
base: ['config/SynthText.yaml']
arch:
type: Model
backbone:
type: resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path: ./datasets/SynthText
img_mode: RGB
loader:
batch_size: 2
shuffle: true
num_workers: 6
collate_fn: ''

@ -0,0 +1,69 @@
name: DBNet
dataset:
train:
dataset:
      type: ICDAR2015Dataset # dataset type
      args:
        data_path: # a file listing img_path \t gt_path pairs
          - ''
        pre_processes: # data preprocessing, including augmentation and label generation
          - type: IaaAugment # apply transforms with imgaug
args:
- {'type':Fliplr, 'args':{'p':0.5}}
- {'type': Affine, 'args':{'rotate':[-10,10]}}
- {'type':Resize,'args':{'size':[0.5,3]}}
- type: EastRandomCropData
args:
size: [640,640]
max_tries: 50
keep_ratio: true
- type: MakeBorderMap
args:
shrink_ratio: 0.4
thresh_min: 0.3
thresh_max: 0.7
- type: MakeShrinkMap
args:
shrink_ratio: 0.4
min_text_size: 8
        transforms: # transforms applied to the image
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
        filter_keys: [img_path,img_name,text_polys,texts,ignore_tags,shape] # keys removed from the data dict before it is returned
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ''
validate:
dataset:
type: ICDAR2015Dataset
args:
data_path:
- ''
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
transforms:
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
filter_keys: []
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ICDARCollectFN

@ -0,0 +1,82 @@
name: DBNet
base: ['config/icdar2015.yaml']
arch:
type: Model
backbone:
type: deformable_resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.txt
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.txt
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

@ -0,0 +1,82 @@
name: DBNet
base: ['config/icdar2015.yaml']
arch:
type: Model
backbone:
type: resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.txt
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.txt
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

@ -0,0 +1,83 @@
name: DBNet
base: ['config/icdar2015.yaml']
arch:
type: Model
backbone:
type: resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: StepLR
args:
step_size: 10
    gamma: 0.8
trainer:
seed: 2
epochs: 500
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.txt
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.txt
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

@ -0,0 +1,79 @@
name: DBNet
base: ['config/icdar2015.yaml']
arch:
type: Model
backbone:
type: resnet50
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
lr_scheduler:
type: Polynomial
args:
learning_rate: 0.001
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output/fp16_o2
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.txt
img_mode: RGB
loader:
batch_size: 16
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.txt
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

@ -0,0 +1,73 @@
name: DBNet
dataset:
train:
dataset:
      type: DetDataset # dataset type
      args:
        data_path: # a file listing img_path \t gt_path pairs
          - ''
        pre_processes: # data preprocessing, including augmentation and label generation
          - type: IaaAugment # apply transforms with imgaug
args:
- {'type':Fliplr, 'args':{'p':0.5}}
- {'type': Affine, 'args':{'rotate':[-10,10]}}
- {'type':Resize,'args':{'size':[0.5,3]}}
- type: EastRandomCropData
args:
size: [640,640]
max_tries: 50
keep_ratio: true
- type: MakeBorderMap
args:
shrink_ratio: 0.4
thresh_min: 0.3
thresh_max: 0.7
- type: MakeShrinkMap
args:
shrink_ratio: 0.4
min_text_size: 8
        transforms: # transforms applied to the image
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
load_char_annotation: false
expand_one_char: false
        filter_keys: [img_path,img_name,text_polys,texts,ignore_tags,shape] # keys removed from the data dict before it is returned
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ''
validate:
dataset:
type: DetDataset
args:
data_path:
- ''
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
transforms:
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
        load_char_annotation: false # whether to load character-level annotations
        expand_one_char: false # whether to widen single-character boxes (w = w + h)
filter_keys: []
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ICDARCollectFN

@ -0,0 +1,86 @@
name: DBNet
base: ['config/open_dataset.yaml']
arch:
type: Model
backbone:
type: deformable_resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 1
show_images_iter: 1
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.json
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 2
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.json
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

@ -0,0 +1,86 @@
name: DBNet
base: ['config/open_dataset.yaml']
arch:
type: Model
backbone:
type: resnest50
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 1
show_images_iter: 1
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.json
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 2
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.json
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

@ -0,0 +1,93 @@
name: DBNet
base: ['config/open_dataset.yaml']
arch:
type: Model
backbone:
type: resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 1
show_images_iter: 1
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.json
        transforms: # transforms applied to the image
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 2
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.json
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:52
# @Author : zhoujun
import copy
import PIL
import numpy as np
import paddle
from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler
from paddle.vision import transforms
def get_dataset(data_path, module_name, transform, dataset_args):
"""
获取训练dataset
:param data_path: dataset文件列表每个文件内以如下格式存储 path/to/img\tlabel
:param module_name: 所使用的自定义dataset名称目前只支持data_loaders.ImageDataset
:param transform: 该数据集使用的transforms
:param dataset_args: module_name的参数
:return: 如果data_path列表不为空返回对于的ConcatDataset对象否则None
"""
from . import dataset
s_dataset = getattr(dataset, module_name)(
transform=transform, data_path=data_path, **dataset_args
)
return s_dataset
def get_transforms(transforms_config):
tr_list = []
for item in transforms_config:
if "args" not in item:
args = {}
else:
args = item["args"]
cls = getattr(transforms, item["type"])(**args)
tr_list.append(cls)
tr_list = transforms.Compose(tr_list)
return tr_list
class ICDARCollectFN:
def __init__(self, *args, **kwargs):
pass
def __call__(self, batch):
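        # Gather each key's values across the batch, then stack array-like
        # values (ndarray / paddle.Tensor / PIL image) into batch tensors.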
data_dict = {}
to_tensor_keys = []
for sample in batch:
for k, v in sample.items():
if k not in data_dict:
data_dict[k] = []
if isinstance(v, (np.ndarray, paddle.Tensor, PIL.Image.Image)):
if k not in to_tensor_keys:
to_tensor_keys.append(k)
data_dict[k].append(v)
for k in to_tensor_keys:
data_dict[k] = paddle.stack(data_dict[k], 0)
return data_dict
def get_dataloader(module_config, distributed=False):
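    # Build a DataLoader from a config dict: resolve transforms, the dataset
    # class, an optional collate_fn, and a (Distributed)BatchSampler.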
if module_config is None:
return None
config = copy.deepcopy(module_config)
dataset_args = config["dataset"]["args"]
if "transforms" in dataset_args:
img_transforms = get_transforms(dataset_args.pop("transforms"))
else:
img_transforms = None
    # create the dataset
dataset_name = config["dataset"]["type"]
data_path = dataset_args.pop("data_path")
    if data_path is None:
return None
data_path = [x for x in data_path if x is not None]
if len(data_path) == 0:
return None
if (
"collate_fn" not in config["loader"]
or config["loader"]["collate_fn"] is None
or len(config["loader"]["collate_fn"]) == 0
):
config["loader"]["collate_fn"] = None
else:
config["loader"]["collate_fn"] = eval(config["loader"]["collate_fn"])()
_dataset = get_dataset(
data_path=data_path,
module_name=dataset_name,
transform=img_transforms,
dataset_args=dataset_args,
)
sampler = None
if distributed:
        # use DistributedBatchSampler so each rank sees its own shard of the data
batch_sampler = DistributedBatchSampler(
dataset=_dataset,
batch_size=config["loader"].pop("batch_size"),
shuffle=config["loader"].pop("shuffle"),
)
else:
batch_sampler = BatchSampler(
dataset=_dataset,
batch_size=config["loader"].pop("batch_size"),
shuffle=config["loader"].pop("shuffle"),
)
loader = DataLoader(
dataset=_dataset, batch_sampler=batch_sampler, **config["loader"]
)
return loader

@ -0,0 +1,190 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:54
# @Author : zhoujun
import pathlib
import os
import cv2
import numpy as np
import scipy.io as sio
from tqdm.auto import tqdm
from base import BaseDataSet
from utils import order_points_clockwise, get_datalist, load, expand_polygon
class ICDAR2015Dataset(BaseDataSet):
def __init__(
self,
data_path: str,
img_mode,
pre_processes,
filter_keys,
ignore_tags,
transform=None,
**kwargs,
):
super().__init__(
data_path, img_mode, pre_processes, filter_keys, ignore_tags, transform
)
def load_data(self, data_path: str) -> list:
data_list = get_datalist(data_path)
t_data_list = []
for img_path, label_path in data_list:
data = self._get_annotation(label_path)
if len(data["text_polys"]) > 0:
item = {"img_path": img_path, "img_name": pathlib.Path(img_path).stem}
item.update(data)
t_data_list.append(item)
else:
print("there is no suit bbox in {}".format(label_path))
return t_data_list
def _get_annotation(self, label_path: str) -> dict:
boxes = []
texts = []
ignores = []
with open(label_path, encoding="utf-8", mode="r") as f:
for line in f.readlines():
params = line.strip().strip("\ufeff").strip("\xef\xbb\xbf").split(",")
try:
box = order_points_clockwise(
np.array(list(map(float, params[:8]))).reshape(-1, 2)
)
if cv2.contourArea(box) > 0:
boxes.append(box)
label = params[8]
texts.append(label)
ignores.append(label in self.ignore_tags)
                except Exception:
                    print("load label failed on {}".format(label_path))
data = {
"text_polys": np.array(boxes),
"texts": texts,
"ignore_tags": ignores,
}
return data
class DetDataset(BaseDataSet):
def __init__(
self,
data_path: str,
img_mode,
pre_processes,
filter_keys,
ignore_tags,
transform=None,
**kwargs,
):
self.load_char_annotation = kwargs["load_char_annotation"]
self.expand_one_char = kwargs["expand_one_char"]
super().__init__(
data_path, img_mode, pre_processes, filter_keys, ignore_tags, transform
)
def load_data(self, data_path: str) -> list:
"""
从json文件中读取出 文本行的坐标和gt字符的坐标和gt
:param data_path:
:return:
"""
data_list = []
for path in data_path:
content = load(path)
for gt in tqdm(content["data_list"], desc="read file {}".format(path)):
img_path = os.path.join(content["data_root"], gt["img_name"])
polygons = []
texts = []
illegibility_list = []
language_list = []
for annotation in gt["annotations"]:
if len(annotation["polygon"]) == 0 or len(annotation["text"]) == 0:
continue
if len(annotation["text"]) > 1 and self.expand_one_char:
annotation["polygon"] = expand_polygon(annotation["polygon"])
polygons.append(annotation["polygon"])
texts.append(annotation["text"])
illegibility_list.append(annotation["illegibility"])
language_list.append(annotation["language"])
if self.load_char_annotation:
for char_annotation in annotation["chars"]:
if (
len(char_annotation["polygon"]) == 0
or len(char_annotation["char"]) == 0
):
continue
polygons.append(char_annotation["polygon"])
texts.append(char_annotation["char"])
illegibility_list.append(char_annotation["illegibility"])
language_list.append(char_annotation["language"])
data_list.append(
{
"img_path": img_path,
"img_name": gt["img_name"],
"text_polys": np.array(polygons),
"texts": texts,
"ignore_tags": illegibility_list,
}
)
return data_list
class SynthTextDataset(BaseDataSet):
def __init__(
self,
data_path: str,
img_mode,
pre_processes,
filter_keys,
transform=None,
**kwargs,
):
self.transform = transform
self.dataRoot = pathlib.Path(data_path)
if not self.dataRoot.exists():
            raise FileNotFoundError("Dataset folder does not exist.")
self.targetFilePath = self.dataRoot / "gt.mat"
if not self.targetFilePath.exists():
            raise FileNotFoundError("Target file gt.mat does not exist.")
targets = {}
sio.loadmat(
self.targetFilePath,
targets,
squeeze_me=True,
struct_as_record=False,
variable_names=["imnames", "wordBB", "txt"],
)
self.imageNames = targets["imnames"]
self.wordBBoxes = targets["wordBB"]
self.transcripts = targets["txt"]
super().__init__(data_path, img_mode, pre_processes, filter_keys, transform)
def load_data(self, data_path: str) -> list:
t_data_list = []
for imageName, wordBBoxes, texts in zip(
self.imageNames, self.wordBBoxes, self.transcripts
):
item = {}
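            # gt.mat stores word boxes as (2, 4, num_words); add the word axis
            # when an image contains a single word so the shape stays uniform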
wordBBoxes = (
np.expand_dims(wordBBoxes, axis=2)
if (wordBBoxes.ndim == 2)
else wordBBoxes
)
_, _, numOfWords = wordBBoxes.shape
text_polys = wordBBoxes.reshape(
[8, numOfWords], order="F"
).T # num_words * 8
text_polys = text_polys.reshape(numOfWords, 4, 2) # num_of_words * 4 * 2
transcripts = [word for line in texts for word in line.split()]
if numOfWords != len(transcripts):
continue
item["img_path"] = str(self.dataRoot / imageName)
item["img_name"] = (self.dataRoot / imageName).stem
item["text_polys"] = text_polys
item["texts"] = transcripts
item["ignore_tags"] = [x in self.ignore_tags for x in transcripts]
t_data_list.append(item)
return t_data_list

@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 10:53
# @Author : zhoujun
from .iaa_augment import IaaAugment
from .augment import *
from .random_crop_data import EastRandomCropData, PSERandomCrop
from .make_border_map import MakeBorderMap
from .make_shrink_map import MakeShrinkMap

@ -0,0 +1,308 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:52
# @Author : zhoujun
import math
import numbers
import random
import cv2
import numpy as np
from skimage.util import random_noise
class RandomNoise:
def __init__(self, random_rate):
self.random_rate = random_rate
def __call__(self, data: dict):
"""
        Add gaussian noise to the image.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
data["img"] = (
random_noise(data["img"], mode="gaussian", clip=True) * 255
).astype(data["img"].dtype)
return data
class RandomScale:
def __init__(self, scales, random_rate):
"""
        :param scales: candidate scales
        :param random_rate: probability of applying this augmentation
:return:
"""
self.random_rate = random_rate
self.scales = scales
def __call__(self, data: dict) -> dict:
"""
        Randomly pick a scale from scales and resize the image and text polygons.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
tmp_text_polys = text_polys.copy()
rd_scale = float(np.random.choice(self.scales))
im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
tmp_text_polys *= rd_scale
data["img"] = im
data["text_polys"] = tmp_text_polys
return data
class RandomRotateImgBox:
def __init__(self, degrees, random_rate, same_size=False):
"""
        :param degrees: rotation angle, a single number or a list/tuple of two numbers
        :param random_rate: probability of applying this augmentation
        :param same_size: whether to keep the output the same size as the input
:return:
"""
if isinstance(degrees, numbers.Number):
if degrees < 0:
raise ValueError("If degrees is a single number, it must be positive.")
degrees = (-degrees, degrees)
elif (
isinstance(degrees, list)
or isinstance(degrees, tuple)
or isinstance(degrees, np.ndarray)
):
if len(degrees) != 2:
raise ValueError("If degrees is a sequence, it must be of len 2.")
degrees = degrees
else:
raise Exception("degrees must in Number or list or tuple or np.ndarray")
self.degrees = degrees
self.same_size = same_size
self.random_rate = random_rate
def __call__(self, data: dict) -> dict:
"""
        Randomly rotate the image and text polygons within degrees.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
        # ---------------------- rotate the image ----------------------
w = im.shape[1]
h = im.shape[0]
angle = np.random.uniform(self.degrees[0], self.degrees[1])
if self.same_size:
nw = w
nh = h
        else:
            # degrees to radians
            rangle = np.deg2rad(angle)
            # compute the rotated image's w, h
            nw = abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w)
            nh = abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w)
        # build the rotation (affine) matrix around the new image center
        rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, 1)
        # offset from the old image center to the new image center
        rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0]))
        # update the affine matrix with the offset
        rot_mat[0, 2] += rot_move[0]
        rot_mat[1, 2] += rot_move[1]
        # apply the affine transform
rot_img = cv2.warpAffine(
im,
rot_mat,
(int(math.ceil(nw)), int(math.ceil(nh))),
flags=cv2.INTER_LANCZOS4,
)
        # ---------------------- correct the bbox coordinates ----------------------
        # rot_mat is the final rotation matrix
        # map each bbox's four vertices into the rotated coordinate system
rot_text_polys = list()
for bbox in text_polys:
point1 = np.dot(rot_mat, np.array([bbox[0, 0], bbox[0, 1], 1]))
point2 = np.dot(rot_mat, np.array([bbox[1, 0], bbox[1, 1], 1]))
point3 = np.dot(rot_mat, np.array([bbox[2, 0], bbox[2, 1], 1]))
point4 = np.dot(rot_mat, np.array([bbox[3, 0], bbox[3, 1], 1]))
rot_text_polys.append([point1, point2, point3, point4])
data["img"] = rot_img
data["text_polys"] = np.array(rot_text_polys)
return data
class RandomResize:
def __init__(self, size, random_rate, keep_ratio=False):
"""
        :param size: target size, a number or a list/tuple in the form [w, h]
        :param random_rate: probability of applying this transform
        :param keep_ratio: whether to keep the aspect ratio (pad before resizing)
:return:
"""
if isinstance(size, numbers.Number):
if size < 0:
                raise ValueError("If size is a single number, it must be positive.")
size = (size, size)
elif (
isinstance(size, list)
or isinstance(size, tuple)
or isinstance(size, np.ndarray)
):
if len(size) != 2:
raise ValueError("If input_size is a sequence, it must be of len 2.")
size = (size[0], size[1])
else:
raise Exception("input_size must in Number or list or tuple or np.ndarray")
self.size = size
self.keep_ratio = keep_ratio
self.random_rate = random_rate
def __call__(self, data: dict) -> dict:
"""
        Resize the image and text boxes to `size`.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
if self.keep_ratio:
            # pad the image so both sides are at least the target size
h, w, c = im.shape
max_h = max(h, self.size[0])
max_w = max(w, self.size[1])
im_padded = np.zeros((max_h, max_w, c), dtype=np.uint8)
im_padded[:h, :w] = im.copy()
im = im_padded
text_polys = text_polys.astype(np.float32)
h, w, _ = im.shape
im = cv2.resize(im, self.size)
w_scale = self.size[0] / float(w)
h_scale = self.size[1] / float(h)
text_polys[:, :, 0] *= w_scale
text_polys[:, :, 1] *= h_scale
data["img"] = im
data["text_polys"] = text_polys
return data
def resize_image(img, short_size):
height, width, _ = img.shape
if height < width:
new_height = short_size
new_width = new_height / height * width
else:
new_width = short_size
new_height = new_width / width * height
new_height = int(round(new_height / 32) * 32)
new_width = int(round(new_width / 32) * 32)
resized_img = cv2.resize(img, (new_width, new_height))
return resized_img, (new_width / width, new_height / height)
class ResizeShortSize:
def __init__(self, short_size, resize_text_polys=True):
"""
        :param short_size: target length of the short side
        :param resize_text_polys: whether to scale the text boxes as well
:return:
"""
self.short_size = short_size
self.resize_text_polys = resize_text_polys
def __call__(self, data: dict) -> dict:
"""
        Scale the image (and optionally the text boxes) so the short side is at least `short_size`.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
im = data["img"]
text_polys = data["text_polys"]
h, w, _ = im.shape
        short_edge = min(h, w)
        scale = (1.0, 1.0)
        if short_edge < self.short_size:
            # ensure the short side >= short_size
            scale = self.short_size / short_edge
            im = cv2.resize(im, dsize=None, fx=scale, fy=scale)
            scale = (scale, scale)
        # im, scale = resize_image(im, self.short_size)
        if self.resize_text_polys:
            # text_polys *= scale
            text_polys[:, :, 0] *= scale[0]
            text_polys[:, :, 1] *= scale[1]
data["img"] = im
data["text_polys"] = text_polys
return data
class HorizontalFlip:
def __init__(self, random_rate):
"""
        :param random_rate: probability of applying this transform
"""
self.random_rate = random_rate
def __call__(self, data: dict) -> dict:
"""
        Horizontally flip the image and text boxes.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
flip_text_polys = text_polys.copy()
flip_im = cv2.flip(im, 1)
h, w, _ = flip_im.shape
flip_text_polys[:, :, 0] = w - flip_text_polys[:, :, 0]
data["img"] = flip_im
data["text_polys"] = flip_text_polys
return data
class VerticalFlip:
def __init__(self, random_rate):
"""
        :param random_rate: probability of applying this transform
"""
self.random_rate = random_rate
def __call__(self, data: dict) -> dict:
"""
        Vertically flip the image and text boxes.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
flip_text_polys = text_polys.copy()
flip_im = cv2.flip(im, 0)
h, w, _ = flip_im.shape
flip_text_polys[:, :, 1] = h - flip_text_polys[:, :, 1]
data["img"] = flip_im
data["text_polys"] = flip_text_polys
return data
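A minimal usage sketch for the transforms above (not part of this commit), assuming a BGR image array and float32 polygons of shape (N, 4, 2):
import numpy as np

transforms = [
    RandomScale(scales=[0.5, 1.0, 2.0], random_rate=0.5),
    RandomRotateImgBox(degrees=10, random_rate=0.5),
    HorizontalFlip(random_rate=0.5),
    RandomNoise(random_rate=0.3),
]
data = {
    "img": np.zeros((640, 640, 3), dtype=np.uint8),
    "text_polys": np.zeros((1, 4, 2), dtype=np.float32),
    "texts": ["sample"],
    "ignore_tags": [False],
}
for t in transforms:
    data = t(data)  # each transform reads and rewrites the same dict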

@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 18:06
# @Author : zhoujun
import numpy as np
import imgaug
import imgaug.augmenters as iaa
class AugmenterBuilder(object):
def __init__(self):
pass
def build(self, args, root=True):
if args is None or len(args) == 0:
return None
elif isinstance(args, list):
if root:
sequence = [self.build(value, root=False) for value in args]
return iaa.Sequential(sequence)
else:
return getattr(iaa, args[0])(
*[self.to_tuple_if_list(a) for a in args[1:]]
)
elif isinstance(args, dict):
cls = getattr(iaa, args["type"])
return cls(**{k: self.to_tuple_if_list(v) for k, v in args["args"].items()})
else:
raise RuntimeError("unknown augmenter arg: " + str(args))
def to_tuple_if_list(self, obj):
if isinstance(obj, list):
return tuple(obj)
return obj
class IaaAugment:
def __init__(self, augmenter_args):
self.augmenter_args = augmenter_args
self.augmenter = AugmenterBuilder().build(self.augmenter_args)
def __call__(self, data):
image = data["img"]
shape = image.shape
if self.augmenter:
aug = self.augmenter.to_deterministic()
data["img"] = aug.augment_image(image)
data = self.may_augment_annotation(aug, data, shape)
return data
def may_augment_annotation(self, aug, data, shape):
if aug is None:
return data
line_polys = []
for poly in data["text_polys"]:
new_poly = self.may_augment_poly(aug, shape, poly)
line_polys.append(new_poly)
data["text_polys"] = np.array(line_polys)
return data
def may_augment_poly(self, aug, img_shape, poly):
keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly]
keypoints = aug.augment_keypoints(
[imgaug.KeypointsOnImage(keypoints, shape=img_shape)]
)[0].keypoints
poly = [(p.x, p.y) for p in keypoints]
return poly
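For context, AugmenterBuilder maps each dict onto an imgaug op by name; a typical configuration (illustrative values, not from this commit) looks like:
aug_args = [
    {"type": "Fliplr", "args": {"p": 0.5}},
    {"type": "Affine", "args": {"rotate": [-10, 10]}},  # the list becomes the tuple (-10, 10)
    {"type": "Resize", "args": {"size": [0.5, 3.0]}},
]
augment = IaaAugment(aug_args)  # data = augment(data) also remaps 'text_polys'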

@ -0,0 +1,159 @@
import cv2
import numpy as np
np.seterr(divide="ignore", invalid="ignore")
import pyclipper
from shapely.geometry import Polygon
class MakeBorderMap:
def __init__(self, shrink_ratio=0.4, thresh_min=0.3, thresh_max=0.7):
self.shrink_ratio = shrink_ratio
self.thresh_min = thresh_min
self.thresh_max = thresh_max
def __call__(self, data: dict) -> dict:
"""
        Compute the threshold (border) map and its mask from the text polygons.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
im = data["img"]
text_polys = data["text_polys"]
ignore_tags = data["ignore_tags"]
canvas = np.zeros(im.shape[:2], dtype=np.float32)
mask = np.zeros(im.shape[:2], dtype=np.float32)
for i in range(len(text_polys)):
if ignore_tags[i]:
continue
self.draw_border_map(text_polys[i], canvas, mask=mask)
canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min
data["threshold_map"] = canvas
data["threshold_mask"] = mask
return data
def draw_border_map(self, polygon, canvas, mask):
polygon = np.array(polygon)
assert polygon.ndim == 2
assert polygon.shape[1] == 2
polygon_shape = Polygon(polygon)
if polygon_shape.area <= 0:
return
distance = (
polygon_shape.area
* (1 - np.power(self.shrink_ratio, 2))
/ polygon_shape.length
)
subject = [tuple(l) for l in polygon]
padding = pyclipper.PyclipperOffset()
padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
padded_polygon = np.array(padding.Execute(distance)[0])
cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
xmin = padded_polygon[:, 0].min()
xmax = padded_polygon[:, 0].max()
ymin = padded_polygon[:, 1].min()
ymax = padded_polygon[:, 1].max()
width = xmax - xmin + 1
height = ymax - ymin + 1
polygon[:, 0] = polygon[:, 0] - xmin
polygon[:, 1] = polygon[:, 1] - ymin
xs = np.broadcast_to(
np.linspace(0, width - 1, num=width).reshape(1, width), (height, width)
)
ys = np.broadcast_to(
np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width)
)
distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
for i in range(polygon.shape[0]):
j = (i + 1) % polygon.shape[0]
absolute_distance = self.distance(xs, ys, polygon[i], polygon[j])
distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
distance_map = distance_map.min(axis=0)
xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
canvas[ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1] = np.fmax(
1
- distance_map[
ymin_valid - ymin : ymax_valid - ymax + height,
xmin_valid - xmin : xmax_valid - xmax + width,
],
canvas[ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1],
)
def distance(self, xs, ys, point_1, point_2):
"""
compute the distance from point to a line
ys: coordinates in the first axis
xs: coordinates in the second axis
point_1, point_2: (x, y), the end of the line
"""
height, width = xs.shape[:2]
square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1])
square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1])
square_distance = np.square(point_1[0] - point_2[0]) + np.square(
point_1[1] - point_2[1]
)
cosin = (square_distance - square_distance_1 - square_distance_2) / (
2 * np.sqrt(square_distance_1 * square_distance_2)
)
square_sin = 1 - np.square(cosin)
square_sin = np.nan_to_num(square_sin)
result = np.sqrt(
square_distance_1 * square_distance_2 * square_sin / square_distance
)
result[cosin < 0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[
cosin < 0
]
# self.extend_line(point_1, point_2, result)
return result
def extend_line(self, point_1, point_2, result):
ex_point_1 = (
int(
round(point_1[0] + (point_1[0] - point_2[0]) * (1 + self.shrink_ratio))
),
int(
round(point_1[1] + (point_1[1] - point_2[1]) * (1 + self.shrink_ratio))
),
)
cv2.line(
result,
tuple(ex_point_1),
tuple(point_1),
4096.0,
1,
lineType=cv2.LINE_AA,
shift=0,
)
ex_point_2 = (
int(
round(point_2[0] + (point_2[0] - point_1[0]) * (1 + self.shrink_ratio))
),
int(
round(point_2[1] + (point_2[1] - point_1[1]) * (1 + self.shrink_ratio))
),
)
cv2.line(
result,
tuple(ex_point_2),
tuple(point_2),
4096.0,
1,
lineType=cv2.LINE_AA,
shift=0,
)
return ex_point_1, ex_point_2
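For reference, the dilation distance used above follows the DB paper: D = A(1 - r^2) / L, where A is the polygon area, L its perimeter and r the shrink ratio; the shrink map in the next file applies the same D as a negative offset. A standalone sketch, assuming shapely and pyclipper are installed:
import numpy as np
import pyclipper
from shapely.geometry import Polygon

poly = [(0, 0), (100, 10), (100, 100), (10, 90)]
shape = Polygon(poly)
r = 0.4
distance = shape.area * (1 - r ** 2) / shape.length  # D = A(1 - r^2) / L

offset = pyclipper.PyclipperOffset()
offset.AddPath(poly, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
padded = np.array(offset.Execute(distance)[0])   # dilated polygon -> border map region
shrunk = np.array(offset.Execute(-distance)[0])  # shrunk polygon -> shrink map region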

@ -0,0 +1,129 @@
import numpy as np
import cv2
def shrink_polygon_py(polygon, shrink_ratio):
"""
    Shrink the polygon towards its centroid; shrinking again with 1/shrink_ratio roughly restores it.
"""
cx = polygon[:, 0].mean()
cy = polygon[:, 1].mean()
polygon[:, 0] = cx + (polygon[:, 0] - cx) * shrink_ratio
polygon[:, 1] = cy + (polygon[:, 1] - cy) * shrink_ratio
return polygon
def shrink_polygon_pyclipper(polygon, shrink_ratio):
from shapely.geometry import Polygon
import pyclipper
polygon_shape = Polygon(polygon)
distance = (
polygon_shape.area * (1 - np.power(shrink_ratio, 2)) / polygon_shape.length
)
subject = [tuple(l) for l in polygon]
padding = pyclipper.PyclipperOffset()
padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
shrunk = padding.Execute(-distance)
if shrunk == []:
shrunk = np.array(shrunk)
else:
shrunk = np.array(shrunk[0]).reshape(-1, 2)
return shrunk
class MakeShrinkMap:
r"""
Making binary mask from detection data with ICDAR format.
Typically following the process of class `MakeICDARData`.
"""
def __init__(self, min_text_size=8, shrink_ratio=0.4, shrink_type="pyclipper"):
shrink_func_dict = {
"py": shrink_polygon_py,
"pyclipper": shrink_polygon_pyclipper,
}
self.shrink_func = shrink_func_dict[shrink_type]
self.min_text_size = min_text_size
self.shrink_ratio = shrink_ratio
def __call__(self, data: dict) -> dict:
"""
        Compute the shrink map and its mask from the text polygons.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
image = data["img"]
text_polys = data["text_polys"]
ignore_tags = data["ignore_tags"]
h, w = image.shape[:2]
text_polys, ignore_tags = self.validate_polygons(text_polys, ignore_tags, h, w)
gt = np.zeros((h, w), dtype=np.float32)
mask = np.ones((h, w), dtype=np.float32)
for i in range(len(text_polys)):
polygon = text_polys[i]
height = max(polygon[:, 1]) - min(polygon[:, 1])
width = max(polygon[:, 0]) - min(polygon[:, 0])
if ignore_tags[i] or min(height, width) < self.min_text_size:
cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
ignore_tags[i] = True
else:
shrunk = self.shrink_func(polygon, self.shrink_ratio)
if shrunk.size == 0:
cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
ignore_tags[i] = True
continue
cv2.fillPoly(gt, [shrunk.astype(np.int32)], 1)
data["shrink_map"] = gt
data["shrink_mask"] = mask
return data
def validate_polygons(self, polygons, ignore_tags, h, w):
"""
polygons (numpy.array, required): of shape (num_instances, num_points, 2)
"""
if len(polygons) == 0:
return polygons, ignore_tags
assert len(polygons) == len(ignore_tags)
for polygon in polygons:
polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)
for i in range(len(polygons)):
area = self.polygon_area(polygons[i])
if abs(area) < 1:
ignore_tags[i] = True
if area > 0:
polygons[i] = polygons[i][::-1, :]
return polygons, ignore_tags
def polygon_area(self, polygon):
        # oriented=True returns the signed area, which the orientation check above relies on
        return cv2.contourArea(polygon, oriented=True)
# edge = 0
# for i in range(polygon.shape[0]):
# next_index = (i + 1) % polygon.shape[0]
# edge += (polygon[next_index, 0] - polygon[i, 0]) * (polygon[next_index, 1] - polygon[i, 1])
#
# return edge / 2.
if __name__ == "__main__":
from shapely.geometry import Polygon
import pyclipper
polygon = np.array([[0, 0], [100, 10], [100, 100], [10, 90]])
a = shrink_polygon_py(polygon, 0.4)
print(a)
print(shrink_polygon_py(a, 1 / 0.4))
b = shrink_polygon_pyclipper(polygon, 0.4)
print(b)
poly = Polygon(b)
distance = poly.area * 1.5 / poly.length
offset = pyclipper.PyclipperOffset()
offset.AddPath(b, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.array(offset.Execute(distance))
bounding_box = cv2.minAreaRect(expanded)
points = cv2.boxPoints(bounding_box)
print(points)

@ -0,0 +1,211 @@
import random
import cv2
import numpy as np
# random crop algorithm similar to https://github.com/argman/EAST
class EastRandomCropData:
def __init__(
self,
size=(640, 640),
max_tries=50,
min_crop_side_ratio=0.1,
require_original_image=False,
keep_ratio=True,
):
self.size = size
self.max_tries = max_tries
self.min_crop_side_ratio = min_crop_side_ratio
self.require_original_image = require_original_image
self.keep_ratio = keep_ratio
def __call__(self, data: dict) -> dict:
"""
        Randomly crop a region containing text and resize it to `size`.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
im = data["img"]
text_polys = data["text_polys"]
ignore_tags = data["ignore_tags"]
texts = data["texts"]
all_care_polys = [text_polys[i] for i, tag in enumerate(ignore_tags) if not tag]
        # compute the crop region
        crop_x, crop_y, crop_w, crop_h = self.crop_area(im, all_care_polys)
        # crop the image, padding to keep the aspect ratio
scale_w = self.size[0] / crop_w
scale_h = self.size[1] / crop_h
scale = min(scale_w, scale_h)
h = int(crop_h * scale)
w = int(crop_w * scale)
if self.keep_ratio:
if len(im.shape) == 3:
padimg = np.zeros((self.size[1], self.size[0], im.shape[2]), im.dtype)
else:
padimg = np.zeros((self.size[1], self.size[0]), im.dtype)
padimg[:h, :w] = cv2.resize(
im[crop_y : crop_y + crop_h, crop_x : crop_x + crop_w], (w, h)
)
img = padimg
else:
img = cv2.resize(
im[crop_y : crop_y + crop_h, crop_x : crop_x + crop_w], tuple(self.size)
)
        # crop the text boxes
text_polys_crop = []
ignore_tags_crop = []
texts_crop = []
for poly, text, tag in zip(text_polys, texts, ignore_tags):
poly = ((poly - (crop_x, crop_y)) * scale).tolist()
if not self.is_poly_outside_rect(poly, 0, 0, w, h):
text_polys_crop.append(poly)
ignore_tags_crop.append(tag)
texts_crop.append(text)
data["img"] = img
data["text_polys"] = np.float32(text_polys_crop)
data["ignore_tags"] = ignore_tags_crop
data["texts"] = texts_crop
return data
def is_poly_in_rect(self, poly, x, y, w, h):
poly = np.array(poly)
if poly[:, 0].min() < x or poly[:, 0].max() > x + w:
return False
if poly[:, 1].min() < y or poly[:, 1].max() > y + h:
return False
return True
def is_poly_outside_rect(self, poly, x, y, w, h):
poly = np.array(poly)
if poly[:, 0].max() < x or poly[:, 0].min() > x + w:
return True
if poly[:, 1].max() < y or poly[:, 1].min() > y + h:
return True
return False
def split_regions(self, axis):
regions = []
min_axis = 0
for i in range(1, axis.shape[0]):
if axis[i] != axis[i - 1] + 1:
region = axis[min_axis:i]
min_axis = i
regions.append(region)
return regions
def random_select(self, axis, max_size):
xx = np.random.choice(axis, size=2)
xmin = np.min(xx)
xmax = np.max(xx)
xmin = np.clip(xmin, 0, max_size - 1)
xmax = np.clip(xmax, 0, max_size - 1)
return xmin, xmax
def region_wise_random_select(self, regions, max_size):
selected_index = list(np.random.choice(len(regions), 2))
selected_values = []
for index in selected_index:
axis = regions[index]
xx = int(np.random.choice(axis, size=1))
selected_values.append(xx)
xmin = min(selected_values)
xmax = max(selected_values)
return xmin, xmax
def crop_area(self, im, text_polys):
h, w = im.shape[:2]
h_array = np.zeros(h, dtype=np.int32)
w_array = np.zeros(w, dtype=np.int32)
for points in text_polys:
points = np.round(points, decimals=0).astype(np.int32)
minx = np.min(points[:, 0])
maxx = np.max(points[:, 0])
w_array[minx:maxx] = 1
miny = np.min(points[:, 1])
maxy = np.max(points[:, 1])
h_array[miny:maxy] = 1
# ensure the cropped area not across a text
h_axis = np.where(h_array == 0)[0]
w_axis = np.where(w_array == 0)[0]
if len(h_axis) == 0 or len(w_axis) == 0:
return 0, 0, w, h
h_regions = self.split_regions(h_axis)
w_regions = self.split_regions(w_axis)
for i in range(self.max_tries):
if len(w_regions) > 1:
xmin, xmax = self.region_wise_random_select(w_regions, w)
else:
xmin, xmax = self.random_select(w_axis, w)
if len(h_regions) > 1:
ymin, ymax = self.region_wise_random_select(h_regions, h)
else:
ymin, ymax = self.random_select(h_axis, h)
if (
xmax - xmin < self.min_crop_side_ratio * w
or ymax - ymin < self.min_crop_side_ratio * h
):
# area too small
continue
num_poly_in_rect = 0
for poly in text_polys:
if not self.is_poly_outside_rect(
poly, xmin, ymin, xmax - xmin, ymax - ymin
):
num_poly_in_rect += 1
break
if num_poly_in_rect > 0:
return xmin, ymin, xmax - xmin, ymax - ymin
return 0, 0, w, h
class PSERandomCrop:
def __init__(self, size):
self.size = size
def __call__(self, data):
imgs = data["imgs"]
h, w = imgs[0].shape[0:2]
th, tw = self.size
if w == tw and h == th:
return imgs
        # if the label map contains text instances, crop around them with some
        # probability (controlled via the threshold_label_map)
        if np.max(imgs[2]) > 0 and random.random() > 3 / 8:
            # top-left corner of the text instances
            tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size
            tl[tl < 0] = 0
            # bottom-right corner of the text instances
            br = np.max(np.where(imgs[2] > 0), axis=1) - self.size
            br[br < 0] = 0
            # leave enough room to crop when the bottom-right point is selected
br[0] = min(br[0], h - th)
br[1] = min(br[1], w - tw)
for _ in range(50000):
i = random.randint(tl[0], br[0])
j = random.randint(tl[1], br[1])
                # make sure the cropped shrink_label_map contains text
if imgs[1][i : i + th, j : j + tw].sum() <= 0:
continue
else:
break
else:
i = random.randint(0, h - th)
j = random.randint(0, w - tw)
# return i, j, th, tw
for idx in range(len(imgs)):
if len(imgs[idx].shape) == 3:
imgs[idx] = imgs[idx][i : i + th, j : j + tw, :]
else:
imgs[idx] = imgs[idx][i : i + th, j : j + tw]
data["imgs"] = imgs
return data
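A toy invocation of EastRandomCropData (illustrative only), assuming one axis-aligned box on a 720x1280 image:
import numpy as np

cropper = EastRandomCropData(size=(640, 640), max_tries=50)
data = {
    "img": np.zeros((720, 1280, 3), dtype=np.uint8),
    "text_polys": np.array([[[100, 100], [300, 100], [300, 160], [100, 160]]], dtype=np.float32),
    "texts": ["sample"],
    "ignore_tags": [False],
}
out = cropper(data)  # out["img"].shape == (640, 640, 3); polys shifted/scaled into the crop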

@ -0,0 +1,21 @@
name: dbnet
channels:
- conda-forge
- defaults
dependencies:
- anyconfig==0.9.10
- future==0.18.2
- imgaug==0.4.0
- matplotlib==3.1.2
- numpy==1.17.4
- opencv
- pyclipper
- PyYAML==5.2
- scikit-image==0.16.2
- Shapely==1.6.4
- tensorboard=2
- tqdm==4.40.1
- ipython
- pip
- pip:
- polygon3
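(To reproduce this environment: `conda env create -f environment.yml`, then `conda activate dbnet`.)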

@ -0,0 +1 @@
CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py --model_path ''

@ -0,0 +1,17 @@
# Only use this if the image and txt file names are identical
rm ./datasets/train_img.txt
rm ./datasets/train_gt.txt
rm ./datasets/test_img.txt
rm ./datasets/test_gt.txt
rm ./datasets/train.txt
rm ./datasets/test.txt
ls ./datasets/train/img/*.jpg > ./datasets/train_img.txt
ls ./datasets/train/gt/*.txt > ./datasets/train_gt.txt
ls ./datasets/test/img/*.jpg > ./datasets/test_img.txt
ls ./datasets/test/gt/*.txt > ./datasets/test_gt.txt
paste ./datasets/train_img.txt ./datasets/train_gt.txt > ./datasets/train.txt
paste ./datasets/test_img.txt ./datasets/test_gt.txt > ./datasets/test.txt
rm ./datasets/train_img.txt
rm ./datasets/train_gt.txt
rm ./datasets/test_img.txt
rm ./datasets/test_gt.txt
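For reference: `paste` joins the two file lists with a tab, so train.txt/test.txt end up holding `image_path<TAB>gt_path` pairs, one sample per line.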

Binary file not shown (image added, 190 KiB).

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:55
# @Author : zhoujun
import copy
from .model import Model
from .losses import build_loss
__all__ = ["build_loss", "build_model"]
support_model = ["Model"]
def build_model(config):
"""
get architecture model class
"""
copy_config = copy.deepcopy(config)
arch_type = copy_config.pop("type")
assert (
arch_type in support_model
), f"{arch_type} is not developed yet!, only {support_model} are support now"
arch_model = eval(arch_type)(copy_config)
return arch_model

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:54
# @Author : zhoujun
from .resnet import *
__all__ = ["build_backbone"]
support_backbone = [
"resnet18",
"deformable_resnet18",
"deformable_resnet50",
"resnet50",
"resnet34",
"resnet101",
"resnet152",
]
def build_backbone(backbone_name, **kwargs):
assert (
backbone_name in support_backbone
), f"all support backbone is {support_backbone}"
backbone = eval(backbone_name)(**kwargs)
return backbone

@ -0,0 +1,366 @@
import math
import paddle
from paddle import nn
BatchNorm2d = nn.BatchNorm2D
__all__ = [
"ResNet",
"resnet18",
"resnet34",
"resnet50",
"resnet101",
"deformable_resnet18",
"deformable_resnet50",
"resnet152",
]
model_urls = {
"resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth",
"resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth",
"resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth",
"resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth",
"resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth",
}
def constant_init(module, constant, bias=0):
module.weight = paddle.create_parameter(
shape=module.weight.shape,
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(constant),
)
if hasattr(module, "bias"):
module.bias = paddle.create_parameter(
shape=module.bias.shape,
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(bias),
)
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2D(
in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias_attr=False
)
class BasicBlock(nn.Layer):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
super(BasicBlock, self).__init__()
self.with_dcn = dcn is not None
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = BatchNorm2d(planes, momentum=0.1)
self.relu = nn.ReLU()
self.with_modulated_dcn = False
if not self.with_dcn:
self.conv2 = nn.Conv2D(
planes, planes, kernel_size=3, padding=1, bias_attr=False
)
else:
from paddle.vision.ops import DeformConv2D
deformable_groups = dcn.get("deformable_groups", 1)
offset_channels = 18
self.conv2_offset = nn.Conv2D(
planes, deformable_groups * offset_channels, kernel_size=3, padding=1
)
self.conv2 = DeformConv2D(
planes, planes, kernel_size=3, padding=1, bias_attr=False
)
self.bn2 = BatchNorm2d(planes, momentum=0.1)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
# out = self.conv2(out)
if not self.with_dcn:
out = self.conv2(out)
else:
offset = self.conv2_offset(out)
out = self.conv2(out, offset)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Layer):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
super(Bottleneck, self).__init__()
self.with_dcn = dcn is not None
self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)
self.bn1 = BatchNorm2d(planes, momentum=0.1)
self.with_modulated_dcn = False
if not self.with_dcn:
self.conv2 = nn.Conv2D(
planes, planes, kernel_size=3, stride=stride, padding=1, bias_attr=False
)
else:
deformable_groups = dcn.get("deformable_groups", 1)
from paddle.vision.ops import DeformConv2D
offset_channels = 18
self.conv2_offset = nn.Conv2D(
planes,
deformable_groups * offset_channels,
stride=stride,
kernel_size=3,
padding=1,
)
self.conv2 = DeformConv2D(
planes, planes, kernel_size=3, padding=1, stride=stride, bias_attr=False
)
self.bn2 = BatchNorm2d(planes, momentum=0.1)
self.conv3 = nn.Conv2D(planes, planes * 4, kernel_size=1, bias_attr=False)
self.bn3 = BatchNorm2d(planes * 4, momentum=0.1)
self.relu = nn.ReLU()
self.downsample = downsample
self.stride = stride
self.dcn = dcn
self.with_dcn = dcn is not None
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
# out = self.conv2(out)
if not self.with_dcn:
out = self.conv2(out)
else:
offset = self.conv2_offset(out)
out = self.conv2(out, offset)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Layer):
def __init__(self, block, layers, in_channels=3, dcn=None):
self.dcn = dcn
self.inplanes = 64
super(ResNet, self).__init__()
self.out_channels = []
self.conv1 = nn.Conv2D(
in_channels, 64, kernel_size=7, stride=2, padding=3, bias_attr=False
)
self.bn1 = BatchNorm2d(64, momentum=0.1)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dcn=dcn)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dcn=dcn)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dcn=dcn)
if self.dcn is not None:
for m in self.modules():
if isinstance(m, Bottleneck) or isinstance(m, BasicBlock):
if hasattr(m, "conv2_offset"):
constant_init(m.conv2_offset, 0)
def _make_layer(self, block, planes, blocks, stride=1, dcn=None):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2D(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias_attr=False,
),
BatchNorm2d(planes * block.expansion, momentum=0.1),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample, dcn=dcn))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, dcn=dcn))
self.out_channels.append(planes * block.expansion)
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x2 = self.layer1(x)
x3 = self.layer2(x2)
x4 = self.layer3(x3)
x5 = self.layer4(x4)
return x2, x3, x4, x5
def load_torch_params(paddle_model, torch_params):
    paddle_params = paddle_model.state_dict()
    fc_names = ["classifier"]
    for key, torch_value in torch_params.items():
if "num_batches_tracked" in key:
continue
key = (
key.replace("running_var", "_variance")
.replace("running_mean", "_mean")
.replace("module.", "")
)
torch_value = torch_value.detach().cpu().numpy()
if key in paddle_params:
flag = [i in key for i in fc_names]
if any(flag) and "weight" in key: # ignore bias
new_shape = [1, 0] + list(range(2, torch_value.ndim))
print(
f"name: {key}, ori shape: {torch_value.shape}, new shape: {torch_value.transpose(new_shape).shape}"
)
torch_value = torch_value.transpose(new_shape)
paddle_params[key] = torch_value
else:
print(f"{key} not in paddle")
paddle_model.set_state_dict(paddle_params)
def load_models(model, model_name):
import torch.utils.model_zoo as model_zoo
    torch_params = model_zoo.load_url(model_urls[model_name])
    load_torch_params(model, torch_params)
def resnet18(pretrained=True, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
print("load from imagenet")
load_models(model, "resnet18")
return model
def deformable_resnet18(pretrained=True, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], dcn=dict(deformable_groups=1), **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
print("load from imagenet")
        # paddle layers have no load_state_dict; convert the torch weights instead
        load_models(model, "resnet18")
return model
def resnet34(pretrained=True, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
        load_models(model, "resnet34")
return model
def resnet50(pretrained=True, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
load_models(model, "resnet50")
return model
def deformable_resnet50(pretrained=True, **kwargs):
"""Constructs a ResNet-50 model with deformable conv.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], dcn=dict(deformable_groups=1), **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
        load_models(model, "resnet50")
return model
def resnet101(pretrained=True, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
        load_models(model, "resnet101")
return model
def resnet152(pretrained=True, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
        load_models(model, "resnet152")
return model
if __name__ == "__main__":
x = paddle.zeros([2, 3, 640, 640])
net = resnet50(pretrained=True)
y = net(x)
for u in y:
print(u.shape)
print(net.out_channels)

@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/6 11:19
# @Author : zhoujun
from paddle import nn
class ConvBnRelu(nn.Layer):
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
padding_mode="zeros",
inplace=True,
):
super().__init__()
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias_attr=bias,
padding_mode=padding_mode,
)
self.bn = nn.BatchNorm2D(out_channels)
self.relu = nn.ReLU()
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.relu(x)
return x

@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 14:54
# @Author : zhoujun
import paddle
from paddle import nn, ParamAttr
class DBHead(nn.Layer):
def __init__(self, in_channels, out_channels, k=50):
super().__init__()
self.k = k
self.binarize = nn.Sequential(
nn.Conv2D(
in_channels,
in_channels // 4,
3,
padding=1,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
),
nn.BatchNorm2D(
in_channels // 4,
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
),
nn.ReLU(),
nn.Conv2DTranspose(
in_channels // 4,
in_channels // 4,
2,
2,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
),
nn.BatchNorm2D(
in_channels // 4,
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
),
nn.ReLU(),
nn.Conv2DTranspose(
in_channels // 4, 1, 2, 2, weight_attr=nn.initializer.KaimingNormal()
),
nn.Sigmoid(),
)
self.thresh = self._init_thresh(in_channels)
def forward(self, x):
shrink_maps = self.binarize(x)
threshold_maps = self.thresh(x)
if self.training:
binary_maps = self.step_function(shrink_maps, threshold_maps)
y = paddle.concat((shrink_maps, threshold_maps, binary_maps), axis=1)
else:
y = paddle.concat((shrink_maps, threshold_maps), axis=1)
return y
def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
in_channels = inner_channels
if serial:
in_channels += 1
self.thresh = nn.Sequential(
nn.Conv2D(
in_channels,
inner_channels // 4,
3,
padding=1,
bias_attr=bias,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
),
nn.BatchNorm2D(
inner_channels // 4,
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
),
nn.ReLU(),
self._init_upsample(
inner_channels // 4, inner_channels // 4, smooth=smooth, bias=bias
),
nn.BatchNorm2D(
inner_channels // 4,
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
),
nn.ReLU(),
self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
nn.Sigmoid(),
)
return self.thresh
def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
if smooth:
inter_out_channels = out_channels
if out_channels == 1:
inter_out_channels = in_channels
module_list = [
nn.Upsample(scale_factor=2, mode="nearest"),
nn.Conv2D(
in_channels,
inter_out_channels,
3,
1,
1,
bias_attr=bias,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
),
]
if out_channels == 1:
module_list.append(
nn.Conv2D(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=1,
bias_attr=True,
weight_attr=ParamAttr(
initializer=nn.initializer.KaimingNormal()
),
)
)
            return nn.Sequential(*module_list)
else:
return nn.Conv2DTranspose(
in_channels,
out_channels,
2,
2,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
)
def step_function(self, x, y):
return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))
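For reference, step_function is the differentiable binarization from the DB paper (https://arxiv.org/abs/1911.08947): B = 1 / (1 + exp(-k(P - T))), where P is the shrink (probability) map, T the threshold map and k = 50 the amplification factor; during training the head returns P, T and B concatenated, while at inference only P and T are produced.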

@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
# @Time : 2020/6/5 11:35
# @Author : zhoujun
from .DBHead import DBHead
__all__ = ["build_head"]
support_head = ["DBHead"]
def build_head(head_name, **kwargs):
    assert head_name in support_head, f"supported heads are {support_head}"
head = eval(head_name)(**kwargs)
return head

@ -0,0 +1,50 @@
import paddle
from models.losses.basic_loss import BalanceCrossEntropyLoss, MaskL1Loss, DiceLoss
class DBLoss(paddle.nn.Layer):
def __init__(self, alpha=1.0, beta=10, ohem_ratio=3, reduction="mean", eps=1e-06):
"""
        Implement DB Loss.
        :param alpha: coefficient of the shrink (binary) map loss
        :param beta: coefficient of the threshold map loss
        :param ohem_ratio: negative:positive sampling ratio for OHEM
        :param reduction: 'mean' or 'sum', how the batch loss is reduced
"""
super().__init__()
        assert reduction in ["mean", "sum"], "reduction must be in ['mean','sum']"
self.alpha = alpha
self.beta = beta
self.bce_loss = BalanceCrossEntropyLoss(negative_ratio=ohem_ratio)
self.dice_loss = DiceLoss(eps=eps)
self.l1_loss = MaskL1Loss(eps=eps)
self.ohem_ratio = ohem_ratio
self.reduction = reduction
def forward(self, pred, batch):
shrink_maps = pred[:, 0, :, :]
threshold_maps = pred[:, 1, :, :]
binary_maps = pred[:, 2, :, :]
loss_shrink_maps = self.bce_loss(
shrink_maps, batch["shrink_map"], batch["shrink_mask"]
)
loss_threshold_maps = self.l1_loss(
threshold_maps, batch["threshold_map"], batch["threshold_mask"]
)
metrics = dict(
loss_shrink_maps=loss_shrink_maps, loss_threshold_maps=loss_threshold_maps
)
if pred.shape[1] > 2:
loss_binary_maps = self.dice_loss(
binary_maps, batch["shrink_map"], batch["shrink_mask"]
)
metrics["loss_binary_maps"] = loss_binary_maps
loss_all = (
self.alpha * loss_shrink_maps
+ self.beta * loss_threshold_maps
+ loss_binary_maps
)
metrics["loss"] = loss_all
else:
metrics["loss"] = loss_shrink_maps
return metrics
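A quick smoke test for the loss (illustrative, assuming PaddlePaddle is installed); shapes follow the head output (N, 3, H, W) during training:
import paddle

loss_fn = DBLoss(alpha=1.0, beta=10)
pred = paddle.nn.functional.sigmoid(paddle.randn([2, 3, 160, 160]))
batch = {
    "shrink_map": paddle.randint(0, 2, [2, 160, 160]).astype("float32"),
    "shrink_mask": paddle.ones([2, 160, 160]),
    "threshold_map": paddle.rand([2, 160, 160]),
    "threshold_mask": paddle.ones([2, 160, 160]),
}
metrics = loss_fn(pred, batch)  # dict with loss, loss_shrink_maps, loss_threshold_maps, loss_binary_maps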

@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
# @Time : 2020/6/5 11:36
# @Author : zhoujun
import copy
from .DB_loss import DBLoss
__all__ = ["build_loss"]
support_loss = ["DBLoss"]
def build_loss(config):
copy_config = copy.deepcopy(config)
loss_type = copy_config.pop("type")
    assert loss_type in support_loss, f"supported losses are {support_loss}"
criterion = eval(loss_type)(**copy_config)
return criterion

@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 14:39
# @Author : zhoujun
import paddle
import paddle.nn as nn
class BalanceCrossEntropyLoss(nn.Layer):
"""
Balanced cross entropy loss.
Shape:
- Input: :math:`(N, 1, H, W)`
- GT: :math:`(N, 1, H, W)`, same shape as the input
- Mask: :math:`(N, H, W)`, same spatial shape as the input
- Output: scalar.
"""
def __init__(self, negative_ratio=3.0, eps=1e-6):
super(BalanceCrossEntropyLoss, self).__init__()
self.negative_ratio = negative_ratio
self.eps = eps
def forward(
self,
pred: paddle.Tensor,
gt: paddle.Tensor,
mask: paddle.Tensor,
return_origin=False,
):
"""
Args:
pred: shape :math:`(N, 1, H, W)`, the prediction of network
gt: shape :math:`(N, 1, H, W)`, the target
mask: shape :math:`(N, H, W)`, the mask indicates positive regions
"""
positive = gt * mask
negative = (1 - gt) * mask
positive_count = int(positive.sum())
negative_count = min(
int(negative.sum()), int(positive_count * self.negative_ratio)
)
loss = nn.functional.binary_cross_entropy(pred, gt, reduction="none")
positive_loss = loss * positive
negative_loss = loss * negative
negative_loss, _ = negative_loss.reshape([-1]).topk(negative_count)
balance_loss = (positive_loss.sum() + negative_loss.sum()) / (
positive_count + negative_count + self.eps
)
if return_origin:
return balance_loss, loss
return balance_loss
class DiceLoss(nn.Layer):
"""
    Loss function from https://arxiv.org/abs/1707.03237,
    where the IoU computation is applied to heatmaps to measure the
    similarity between two heatmaps.
"""
def __init__(self, eps=1e-6):
super(DiceLoss, self).__init__()
self.eps = eps
def forward(self, pred: paddle.Tensor, gt, mask, weights=None):
"""
pred: one or two heatmaps of shape (N, 1, H, W),
            the losses of the two heatmaps are added together.
gt: (N, 1, H, W)
mask: (N, H, W)
"""
return self._compute(pred, gt, mask, weights)
def _compute(self, pred, gt, mask, weights):
if len(pred.shape) == 4:
pred = pred[:, 0, :, :]
gt = gt[:, 0, :, :]
assert pred.shape == gt.shape
assert pred.shape == mask.shape
if weights is not None:
assert weights.shape == mask.shape
mask = weights * mask
intersection = (pred * gt * mask).sum()
union = (pred * mask).sum() + (gt * mask).sum() + self.eps
loss = 1 - 2.0 * intersection / union
assert loss <= 1
return loss
class MaskL1Loss(nn.Layer):
def __init__(self, eps=1e-6):
super(MaskL1Loss, self).__init__()
self.eps = eps
def forward(self, pred: paddle.Tensor, gt, mask):
loss = (paddle.abs(pred - gt) * mask).sum() / (mask.sum() + self.eps)
return loss

@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:57
# @Author : zhoujun
from addict import Dict
from paddle import nn
import paddle.nn.functional as F
from models.backbone import build_backbone
from models.neck import build_neck
from models.head import build_head
class Model(nn.Layer):
def __init__(self, model_config: dict):
"""
PANnet
:param model_config: 模型配置
"""
super().__init__()
model_config = Dict(model_config)
backbone_type = model_config.backbone.pop("type")
neck_type = model_config.neck.pop("type")
head_type = model_config.head.pop("type")
self.backbone = build_backbone(backbone_type, **model_config.backbone)
self.neck = build_neck(
neck_type, in_channels=self.backbone.out_channels, **model_config.neck
)
self.head = build_head(
head_type, in_channels=self.neck.out_channels, **model_config.head
)
self.name = f"{backbone_type}_{neck_type}_{head_type}"
def forward(self, x):
_, _, H, W = x.shape
backbone_out = self.backbone(x)
neck_out = self.neck(backbone_out)
y = self.head(neck_out)
y = F.interpolate(y, size=(H, W), mode="bilinear", align_corners=True)
return y

@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
# @Time : 2019/9/13 10:29
# @Author : zhoujun
import paddle
import paddle.nn.functional as F
from paddle import nn
from models.basic import ConvBnRelu
class FPN(nn.Layer):
def __init__(self, in_channels, inner_channels=256, **kwargs):
"""
        :param in_channels: output channel counts of the backbone stages
:param kwargs:
"""
super().__init__()
inplace = True
self.conv_out = inner_channels
inner_channels = inner_channels // 4
# reduce layers
self.reduce_conv_c2 = ConvBnRelu(
in_channels[0], inner_channels, kernel_size=1, inplace=inplace
)
self.reduce_conv_c3 = ConvBnRelu(
in_channels[1], inner_channels, kernel_size=1, inplace=inplace
)
self.reduce_conv_c4 = ConvBnRelu(
in_channels[2], inner_channels, kernel_size=1, inplace=inplace
)
self.reduce_conv_c5 = ConvBnRelu(
in_channels[3], inner_channels, kernel_size=1, inplace=inplace
)
# Smooth layers
self.smooth_p4 = ConvBnRelu(
inner_channels, inner_channels, kernel_size=3, padding=1, inplace=inplace
)
self.smooth_p3 = ConvBnRelu(
inner_channels, inner_channels, kernel_size=3, padding=1, inplace=inplace
)
self.smooth_p2 = ConvBnRelu(
inner_channels, inner_channels, kernel_size=3, padding=1, inplace=inplace
)
self.conv = nn.Sequential(
nn.Conv2D(self.conv_out, self.conv_out, kernel_size=3, padding=1, stride=1),
nn.BatchNorm2D(self.conv_out),
nn.ReLU(),
)
self.out_channels = self.conv_out
def forward(self, x):
c2, c3, c4, c5 = x
# Top-down
p5 = self.reduce_conv_c5(c5)
p4 = self._upsample_add(p5, self.reduce_conv_c4(c4))
p4 = self.smooth_p4(p4)
p3 = self._upsample_add(p4, self.reduce_conv_c3(c3))
p3 = self.smooth_p3(p3)
p2 = self._upsample_add(p3, self.reduce_conv_c2(c2))
p2 = self.smooth_p2(p2)
x = self._upsample_cat(p2, p3, p4, p5)
x = self.conv(x)
return x
def _upsample_add(self, x, y):
return F.interpolate(x, size=y.shape[2:]) + y
def _upsample_cat(self, p2, p3, p4, p5):
h, w = p2.shape[2:]
p3 = F.interpolate(p3, size=(h, w))
p4 = F.interpolate(p4, size=(h, w))
p5 = F.interpolate(p5, size=(h, w))
return paddle.concat([p2, p3, p4, p5], axis=1)
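Shape sketch (illustrative): with a resnet18 backbone the stage channels are [64, 128, 256, 512], and the FPN fuses them back to inner_channels at 1/4 resolution:
import paddle

fpn = FPN(in_channels=[64, 128, 256, 512], inner_channels=256)
feats = [paddle.randn([1, c, s, s]) for c, s in zip([64, 128, 256, 512], [160, 80, 40, 20])]
out = fpn(feats)  # shape (1, 256, 160, 160)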

@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
# @Time : 2020/6/5 11:34
# @Author : zhoujun
from .FPN import FPN
__all__ = ["build_neck"]
support_neck = ["FPN"]
def build_neck(neck_name, **kwargs):
    assert neck_name in support_neck, f"supported necks are {support_neck}"
neck = eval(neck_name)(**kwargs)
return neck
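Putting it together, a config sketch that build_model from models/__init__.py accepts (hyper-parameter values here are illustrative, assuming the package is importable as `models`):
from models import build_model

config = {
    "type": "Model",
    "backbone": {"type": "resnet18", "pretrained": False},
    "neck": {"type": "FPN", "inner_channels": 256},
    "head": {"type": "DBHead", "out_channels": 2, "k": 50},
}
model = build_model(config)  # model.name == "resnet18_FPN_DBHead"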

Some files were not shown because too many files have changed in this diff.