|
|
import easyocr
|
|
|
import os
|
|
|
import re
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
class id_card_ocr():
    """Batch-OCR photos of Chinese ID cards and export the parsed fields.

    Every image in ``self.images`` is read with EasyOCR, the recognised text
    is concatenated, and name / gender / ethnicity / address / ID number are
    extracted with regular expressions anchored on the labels printed on the
    card.
    """

    # Only files with these extensions are sent to OCR.  The Excel report is
    # written into the same folder, so without a filter a second run would
    # try to OCR the spreadsheet itself.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

    # Field patterns, compiled once.  The alternatives are wrapped in
    # non-capturing groups: a bare ``别(.*?)民族|民`` parses as
    # ``(别(.*?)民族)|(民)`` and yields an empty capture whenever OCR drops
    # one glyph of the "民族" label.
    _NAME_RE = re.compile(r'名(.*?)性')
    _GENDER_RE = re.compile(r'别(.*?)(?:民族|民)')
    _NATION_RE = re.compile(r'(?:民族|族)(.*?)出')
    # NOTE(review): the lazy match stops at the first "公", so an address
    # that itself contains "公" is truncated -- kept as in the original.
    _ADDRESS_RE = re.compile(r'址(.*?)公')
    _NUMBER_RE = re.compile(r'身份号码(\d+)')

    def __init__(self, images=r'D:/id_card'):
        """Remember the folder that holds the ID card images.

        Args:
            images: Directory containing the card images.  Per the original
                author's note, image file names must not contain Chinese
                characters, otherwise reading them raises an error.
        """
        self.images = images

    def ocr_reader(self):
        """Create and return an EasyOCR reader for Simplified Chinese + English."""
        return easyocr.Reader(['ch_sim', 'en'], gpu=True)

    @staticmethod
    def _first_group(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*, or ''."""
        match = pattern.search(text)
        return match.group(1) if match else ''

    @classmethod
    def _parse_fields(cls, text):
        """Extract ``[name, gender, nation, address, number]`` from OCR text.

        Only the first occurrence of each label is used, so characters such
        as "别" or "族" that appear later (e.g. inside the address) cannot
        leak into an earlier field.  Missing fields come back as ''.
        """
        name = cls._first_group(cls._NAME_RE, text)
        gender = cls._first_group(cls._GENDER_RE, text)
        nation = cls._first_group(cls._NATION_RE, text)
        address = cls._first_group(cls._ADDRESS_RE, text)
        number = cls._first_group(cls._NUMBER_RE, text)
        # The pattern captures digits only; a 17-digit result means the
        # trailing check character was not a digit, so it is restored as "X".
        if len(number) == 17:
            number = number + "X"
        return [name, gender, nation, address, number]

    def read_content(self):
        """OCR every image in ``self.images`` and parse the card fields.

        Returns:
            A list with one ``[name, gender, nation, address, number]`` row
            per image, in sorted file-name order.
        """
        # Building a reader is expensive, so create it once instead of once
        # per image.
        reader = self.ocr_reader()
        data = []
        for image in sorted(os.listdir(self.images)):
            extension = os.path.splitext(image)[1].lower()
            if extension not in self._IMAGE_EXTENSIONS:
                continue  # e.g. the previously exported 识别结果.xlsx
            print(f'正在识别:{image}')
            fragments = reader.readtext(f'{self.images}/{image}', detail=0)
            # Merge the recognised fragments into one string without spaces.
            text = ''.join(fragments).replace(" ", "")
            data.append(self._parse_fields(text))
            print(f'完成识别:{image}')
        print(data)
        return data

    def read_to_excel(self):
        """Run the OCR pass, print the result table and save it as Excel.

        The workbook is written to ``<self.images>/识别结果.xlsx``.

        Returns:
            The ``pandas.DataFrame`` holding one row per recognised card.
        """
        df = pd.DataFrame(
            self.read_content(),
            columns=['姓名', '性别', '民族', '地址', '身份证号码'],
        )
        print('识别结果如下:')
        print(df)
        df.to_excel(f'{self.images}/识别结果.xlsx', index=False)
        return df
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # read_to_excel() runs read_content() itself, so calling read_content()
    # first would OCR every image twice.
    info = id_card_ocr()
    info.read_to_excel()