python从零到一爬坑之路

最新推荐文章于 2023-03-27 17:21:11 发布

编程迪

最新推荐文章于 2023-03-27 17:21:11 发布

阅读量197

点赞数

文章标签： python 爬虫

本文链接：https://blog.csdn.net/zd1007129657/article/details/102664140

版权

python从0到1历时两天，成功拿下，政府的网站也不过如此嘛！在这之前真的连python怎么写都不知道，一开始还记得是paython，作为一个前端工程师，真不敢相信自己竟然拿到了数据，稍微有点激动，真的是一路百度一路查，感谢新同事和老同学的指导，特此记录。具体操作流程如下：

Python环境安装

http://fastsoft.onlinedown.net/down/python-3.7.4.exe
注意： win10系统下安装请勾选 Add Python 3.7 to PATH，否则会提示python不是内部命令

安装requests模块

pip install requests（pip是python自带的命令直接执行即可）

安装xlrd和xlwt（用于获取和写入excel表格数据）

pip install xlrd（读取excel）
pip install xlwt（写入excel）

安装BeautifulSoup4（用于网页解析）

pip install beautifulsoup4
pip install lxml（Beautiful soup需要lxml包对文件进行处理，所以在安装完bs4之后你还需要安装lxml包）

代码分享

# Standard library
import base64   # base64-encodes the captcha image for the OCR request
import json     # parses the JSON responses

# Third-party
import requests
import xlrd     # reads the input Excel file
import xlwt     # writes the results to a local Excel file
from bs4 import BeautifulSoup   # parses the HTML result page

# Input: the first sheet of table.xls, selected by index.
# (edata.sheet_by_name(u'Sheet1') would select it by name instead.)
edata = xlrd.open_workbook('table.xls')
table = edata.sheets()[0]
nrows, ncols = table.nrows, table.ncols   # row count, column count

# Output: a fresh workbook with one sheet whose cells may be overwritten.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('My Worksheet', cell_overwrite_ok=True)

# One session object reused for every request sent to the site.
browser = requests.session()

#list = ["91440300MA5ELXPP71","914403003120937795","914403003197446777","91440300359597114T"]
#list = ["91440300MA5ELXPP71"]

def downimage(i, no, max_attempts=10):
    """Query one company on amr.sz.gov.cn and record the result in the workbook.

    Each attempt loads the query page, downloads a fresh captcha image, sends
    it to the Baidu OCR endpoint, and submits the recognised code together
    with the registration number. When the site accepts the request, the
    company name and the text of the second ``<h5>`` on the task-info page are
    written to row ``i`` of the module-level ``worksheet`` and the workbook is
    saved to ``shuju.xls``.

    Args:
        i: Row index; used in the captcha file name and as the output row.
        no: Registration / credit code sent as ``pubinfo.regno``.
        max_attempts: Upper bound on captcha attempts. The site also reports
            a false status for reasons other than a misread captcha, so the
            retry must be bounded instead of recursing without limit.

    Returns:
        True once a row has been written, False if every attempt failed.
    """
    import os  # function-scope import: only needed to create the captcha folder

    page_url = 'https://amr.sz.gov.cn/szjxjgw/pubinfo/pubinfoAction!index.dhtml?type=mobile'
    captcha_url = "https://amr.sz.gov.cn/szjxjgw/checkCodeServlet?length=6&width=80"
    ocr_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic?access_token=******"
    query_url = 'https://amr.sz.gov.cn/szjxjgw/pubinfo/pubinfoAjaxAction!queryEntinfo.dhtml'
    task_url = 'https://amr.sz.gov.cn/szjxjgw/pubinfo/pubinfoAction!queryTaskinfo.dhtml?type=mobile'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'

    headers = {
        "Referer": page_url,
        'User-Agent': user_agent,
    }
    # Same headers plus an explicit form content type; used for the OCR call
    # and for the final task-info request.
    headers2 = {
        "Referer": page_url,
        'User-Agent': user_agent,
        "Content-Type": "application/x-www-form-urlencoded",
    }

    # The captcha images are kept on disk; make sure the folder exists.
    os.makedirs("codeImg", exist_ok=True)
    image_path = "codeImg/" + str(i) + "image.jpg"

    for _attempt in range(max_attempts):
        # Load the query page first, then fetch the captcha in the same session.
        browser.get(page_url, headers=headers)
        image = browser.get(captcha_url, headers=headers).content
        print(i)

        # Save the captcha locally and base64-encode the bytes already in memory.
        with open(image_path, "wb") as f:
            f.write(image)
        base64_img = base64.b64encode(image).decode()

        # Ask the Baidu OCR endpoint to read the captcha.
        res = requests.post(ocr_url, data={'image': base64_img}, headers=headers2)
        ocr_result = json.loads(res.text)
        words = ocr_result.get('words_result') or []
        if not words:
            # OCR error or nothing recognised: show the response and try a new captcha.
            print(ocr_result)
            continue
        imgCode = words[-1]['words'].replace(' ', '')
        print(imgCode)

        # Submit the captcha code and registration number to get the company info.
        form = {
            "logincode": imgCode,
            "pubinfo.regno": no,
        }
        resp = browser.post(query_url, headers=headers, data=form)
        aJson = json.loads(resp.content.decode('utf8'))
        print(aJson)
        if not aJson['status']:
            # Rejected (possibly a misread captcha): start over with a new one.
            continue

        entname = aJson['obj']['entname']
        print(entname)

        # With the company name known, request the task-info page.
        form2 = {
            "logincode": imgCode,
            "pubinfo.regno": no,
            "pubinfo.entname": entname,
        }
        resp = browser.post(task_url, headers=headers2, data=form2)
        html = resp.content.decode('utf8')
        soup = BeautifulSoup(html, 'lxml')
        h5s = soup.find_all('h5')
        if len(h5s) > 1:
            info = h5s[1].string    # the second <h5> carries the wanted text
        else:
            info = None
            print('expected at least two <h5> tags, found', len(h5s))
        print(info)

        # Write the header row and this company's row, then save the workbook.
        head = ['验证码', '信用代码', '公司名称', '信息']
        for col, title in enumerate(head):
            worksheet.write(0, col, title)
        for col, value in enumerate((imgCode, no, entname, info)):
            worksheet.write(i, col, value)
        workbook.save('shuju.xls')
        return True

    return False

#if __name__=="__main__":
#    for i in range(len(list)):
#        downimage(i)

# for i in range(nrows):
#     #for j in range(ncols):
#         if(i>0):
#             content = table.cell(i,1).value     # 获取数据
#             #downimage(i,content)

import threading    # python的轮询Timer
def say_hello(i):
    """Process input row ``i`` and schedule the next row 30 seconds later.

    Reads column 0 of row ``i`` from the module-level ``table`` and, when the
    cell is not empty, passes it to ``downimage``. The next row is scheduled
    with ``threading.Timer`` before the current one is processed, and only
    while it is still inside the sheet, so the chain ends after the last row
    instead of raising IndexError in a new timer thread every 30 seconds.

    Args:
        i: Index of the row to process.
    """
    row = int(i)
    if row >= table.nrows:
        return  # past the end of the sheet: nothing to read

    if row + 1 < table.nrows:
        timer_online = threading.Timer(30, say_hello, (i + 1,))
        timer_online.start()

    content = table.cell(row, 0).value     # value of the first column
    if content != '':
        downimage(i, content)

# Start the polling chain at row 1. downimage writes its header into output
# row 0; input row 0 is skipped — presumably a header row, confirm against table.xls.
say_hello(1)