python爬虫之验证码

验证码的形式和复杂度千差万别，需要针对特定的验证码做对应的处理，才可能识别。这里主要介绍pytesseract的使用。

Tesseract

Tesseract 是一个 OCR 库，目前由 Google 赞助（Google 也是一家以 OCR 和机器学习技术闻名于世的公司）。Tesseract 是目前公认最优秀、最精确的开源 OCR 系统。除了极高的精确度，Tesseract 也具有很高的灵活性。它可以通过训练识别出任何字体，也可以识别出任何 Unicode 字符。而pytesseract是对tesseract的一层python封装。

安装

pytesseract可以直接用pip安装，也可以下载源码包编译安装，地址：https://pypi.python.org/pypi/pytesseract 。由于pytesseract会调用tesseract，所以还需要安装tesseract（tesseract-ocr）。tesseract-ocr的安装也比较简单，官方文档在此，自己看：https://github.com/tesseract-ocr/tesseract/wiki 。语言库下载地址：https://github.com/tesseract-ocr/tessdata

使用

直接来个例子：如果是纯数字，用get_code1效果还不错；如果有背景干扰，用get_code2；如果有很严重的干扰线，可以用get_code3试试，不保证成功率。感觉只靠pytesseract还是不行，需要通过训练来提高准确率，下次研究。

from PIL import Image
import pytesseract
import os

    
def get_code1(file_path):
    """Run OCR on the image at ``file_path`` without any preprocessing.

    The file is opened with PIL and handed straight to
    ``pytesseract.image_to_string``; whatever that call returns is
    returned to the caller.
    """
    image = Image.open(file_path)
    return pytesseract.image_to_string(image)

def get_code2(file_path):
    """Binarize the image at ``file_path``, blank its border, then run OCR.

    Steps: open the file, convert it to greyscale ('L'), map every grey
    level through a 0/1 lookup table into a mode '1' image, overwrite the
    outermost rows and columns with 255, and pass the result to
    ``pytesseract.image_to_string``.
    """
    cutoff = 140
    # One entry per grey level: 0 below the cutoff, 1 at or above it.
    lut = [0 if level < cutoff else 1 for level in range(256)]
    img = Image.open(file_path).convert('L').point(lut, '1')

    # Remove the one-pixel frame by writing 255 along all four edges.
    width, height = img.size
    px = img.load()
    for col in range(width):
        px[col, 0] = px[col, height - 1] = 255
    for row in range(height):
        px[0, row] = px[width - 1, row] = 255

    return pytesseract.image_to_string(img)

def get_code3(file_path, show=False):
    """Binarize, blank the border, remove isolated noise, then run OCR.

    Intended for images with heavy interference lines.

    Args:
        file_path: Path of the image file to recognise.
        show: When true, open the intermediate image (after binarization
            and border removal, before noise removal) in the default
            viewer via ``Image.show``. Off by default so that batch runs
            do not pop up a viewer window for every file.

    Returns:
        The value returned by ``pytesseract.image_to_string`` for the
        cleaned image.
    """
    threshold = 140
    # Greyscale -> pure black/white while staying in mode 'L':
    # levels below the threshold become 0, everything else 255.
    lut = [0 if level < threshold else 255 for level in range(256)]
    p = Image.open(file_path).convert('L').point(lut)
    w, h = p.size
    pixdata = p.load()

    # Remove the one-pixel frame by whitening all four edges.
    for x in range(w):
        pixdata[x, 0] = pixdata[x, h - 1] = 255
    for y in range(h):
        pixdata[0, y] = pixdata[w - 1, y] = 255

    if show:
        p.show()

    # Noise removal: an interior pixel with more than 4 white pixels among
    # its 8 neighbours is turned white. Pixels are rewritten in place in
    # raster order, so earlier clean-ups feed into later neighbour counts.
    neighbours = [
        (dx, dy)
        for dy in (-1, 0, 1)
        for dx in (-1, 0, 1)
        if (dx, dy) != (0, 0)
    ]
    for y in range(1, h - 1):
        for x in range(1, w - 1):
            white = sum(
                1 for dx, dy in neighbours if pixdata[x + dx, y + dy] > 245
            )
            if white > 4:
                pixdata[x, y] = 255

    return pytesseract.image_to_string(p)

def main():
    """Run all three recognisers on every non-hidden file in the sample dir.

    For each directory entry whose name does not start with a dot, prints
    one line per recogniser (get_code1, get_code2, get_code3) followed by
    a separator line.
    """
    path = "/xxx/yzm/"
    for name in os.listdir(path):
        # Skip hidden entries such as .DS_Store.
        if name.startswith('.'):
            continue
        file_path = path + name
        for recognise in (get_code1, get_code2, get_code3):
            print(name + '的验证码是' + recognise(file_path))
        print("=======================")
            
# Run the demo. The call sits at module level without a __main__ guard,
# so it also executes when this file is imported.
main()

Tesseract#

安装#

使用#

Tesseract

安装

使用