1. Added: the province and city to scrape can now be chosen freely.
2. Added: requests are sent out through randomly chosen proxy IPs.
from tkinter import *  # window widgets
import requests
from lxml import etree
from tkinter import ttk
from bs4 import BeautifulSoup
import webbrowser  # open pages in the system browser
from tkinter import messagebox  # pop-up message boxes
from openpyxl import Workbook
import openpyxl
import time  # delays between requests
import random
exitbiaozhi = False  # flag for breaking out of the scraping loop ("global" at module level is a no-op, so initialize it instead)
treedata1 = []  # global list holding the scraped company detail records
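
# Note 2 above relies on free public proxies, which go stale quickly. A small
# optional helper like the sketch below (an illustration, not part of the
# original script) can weed out dead entries before a run:
def check_proxy(ip_port, timeout=3):
    """Return True if the proxy answers a plain HTTP request within `timeout` seconds."""
    try:
        requests.get('http://permit.mee.gov.cn', proxies={'http': ip_port}, timeout=timeout)
        return True
    except requests.RequestException:
        return False
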
# Read the administrative division codes ..........................................
shengjidaima = ''  # selected province code
shijidaima = ''    # selected city code
a_dict = {}  # province name -> province code
a_list = []  # province names, in sheet order
b_dict = {}  # city name -> city code
b_list = []  # city names for the currently selected province
filepath = '行政代码区划.xlsx'  # open the existing Excel file of division codes
wb = openpyxl.load_workbook(filepath)
ws = wb['Sheet1']
max_row = ws.max_row       # total rows in Sheet1 (provinces)
max_col = ws.max_column    # total columns in Sheet1
ws2 = wb['Sheet2']
max_row2 = ws2.max_row     # total rows in Sheet2 (cities)
max_col2 = ws2.max_column  # total columns in Sheet2
for x in range(1, max_row + 1):  # openpyxl rows are 1-based; +1 so the last row is read too
    cell_data = ws.cell(row=x, column=1).value  # province name
    cell_id = ws.cell(row=x, column=2).value    # province code
    a_dict[cell_data] = cell_id  # map province name -> code
    a_list.append(cell_data)     # keep the name for the combobox list
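
# a_dict / a_list now hold every province from Sheet1; the combobox callbacks
# below use them to drive the province -> city cascade.
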
def xFunc(event):
    # Fires when a province is picked; rebuild the city combobox to match.
    # (com.get() and xVariable.get() both return the selected value.)
    a_dict_str = str(a_dict[com.get()])[0:2]  # first two digits of the province code
    print(a_dict_str)
    # A ttk.Combobox has no delete() for its drop-down list: clearing the backing
    # containers and reassigning ["value"] below is what replaces the old entries.
    b_dict.clear()
    b_list.clear()
    for xx in range(1, max_row2 + 1):  # walk Sheet2, the city-level code table
        cell_data2 = ws2.cell(row=xx, column=1).value
        cell_id2 = ws2.cell(row=xx, column=3).value
        cell_id2_str = str(cell_id2)[0:2]  # first two digits link each city to its province
        if cell_id2_str == a_dict_str:
            b_dict[cell_data2] = cell_id2
            b_list.append(cell_data2)
    com2["value"] = b_list
    com2.current(1)  # default to showing the second entry in the list
def dierFunc(event):
    # Fires when a city is picked; remember both selected codes.
    global shengjidaima, shijidaima
    shengjidaima = a_dict[com.get()]
    shijidaima = b_dict[com2.get()]
    print(com.get(), shengjidaima, com2.get(), shijidaima)

def huoqudaima():
    print(shengjidaima, shijidaima)
# End of administrative division code loading .....................................
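
# download_song() drives the whole scrape: one POST to learn the page count,
# then a loop over every result page, following each company's link in turn.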
def download_song():
    # Open the query page once to learn the total page count ......................
http_ip = [
'221.122.91.64:80',
'163.125.65.90:9797',
'58.251.230.5:9797',
'118.69.50.154:80',
'203.202.245.62:80',
'60.191.11.251:3128',
'54.241.121.74:3128',
'58.220.95.54:9400',
'52.179.231.206:80',
'3.22.74.49:3128',
'52.179.18.244:8080'
]
    proxy_ip = {
        'http': random.choice(http_ip),  # random.choice() returns a random item from a list, tuple, or string
    }
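    # requests only routes schemes that appear as keys in the proxies dict, so
    # with only an 'http' entry any https:// request would bypass the proxy.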
print(proxy_ip)
datas = {
"page.pageNo": "1",
"page.orderBy": "",
"page.order": "",
"province": shengjidaima,
"city": shijidaima,
"registerentername": "",
"xkznum": "",
"treadname": "",
"treadcode": "",
"publishtime": ""}
url = "http://permit.mee.gov.cn/permitExt/syssb/xkgg/xkgg!licenseInformation.action"
    r = requests.post(url, data=datas, proxies=proxy_ip)
html = etree.HTML(r.text)
    urlpage = html.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')[5]  # pager link whose onclick carries the total page count
    urlpageidstr = urlpage[21:23]  # slice the digits out of the onclick string (brittle: assumes a two-digit page count)
print(urlpage)
print(urlpageidstr)
    # Scrape every result page ....................................................
    start_page = 1
    urlpageidstr = int(urlpageidstr)  # total page count as an integer
    # urlpageidstr = 2  # debug: cap at 2 pages for speed
    datasum = 1  # running count of scraped records
    pagesum2 = urlpageidstr * 10  # estimated record total (the site lists 10 rows per page)
    messagebox.showinfo("提示", "数据正在读取请稍候。。。。")
    filepath = 'paiwuxuke.xlsx'  # open the existing Excel file that receives the results
    wb = openpyxl.load_workbook(filepath)
    ws = wb['Sheet']
    max_row = ws.max_row     # rows already in the sheet
    max_col = ws.max_column  # columns in the sheet
    global exitbiaozhi
    enter1['state'] = 'readonly'  # lock the delay entry box while scraping runs
    for page in range(start_page, urlpageidstr + 1):  # +1 so the last page is scraped too
        # Fetch one result page and collect the link to each company ..............
        proxy_ip = {
            'http': random.choice(http_ip),  # rotate to a fresh random proxy for this page
        }
        print(proxy_ip)
        datas = {
            "page.pageNo": page,
            "page.orderBy": "",
            "page.order": "",
            "province": shengjidaima,
            "city": shijidaima,
            "registerentername": "",
            "xkznum": "",
            "treadname": "",
            "treadcode": "",
            "publishtime": ""}
        url = "http://permit.mee.gov.cn/permitExt/syssb/xkgg/xkgg!licenseInformation.action"
        r = requests.post(url, data=datas, proxies=proxy_ip)
        html = etree.HTML(r.text)
        href_url_list = html.xpath('//table[@class="tabtd"]/tr/td/a/@href')  # links to each company's detail page
        href_name = html.xpath('//table[@class="tabtd"]/tr')[1:]  # data rows, skipping the header row
        urlpage = html.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')[5]  # re-read the pager (total page count)
        i = 0
        # Open each company's detail page and parse it .............................
        for href_url, roos in zip(href_url_list, href_name):
            proxy_ip = {
                'http': random.choice(http_ip),  # pick another random proxy per detail page
            }
            print(proxy_ip)
            name = roos.xpath('./td[4]/text()')[0]  # company name from the 4th table column
            i = i + 1
            addurl = href_url[39:93]  # slice the dataid out of the href (brittle: fixed character positions)
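            # A less position-dependent alternative (an unverified sketch that assumes
            # the href carries a "dataid=..." query string, so left commented out):
            # addurl = href_url.split('dataid=')[-1]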
            # These are query parameters, not HTTP headers (the original passed them via headers=).
            params = {
                "xkgk": "getxxgkContent",
                "dataid": addurl}
            url = "http://permit.mee.gov.cn/permitExt/xkgkAction!xkgk.action"
            html = requests.get(url, params=params, proxies=proxy_ip)
            soup = BeautifulSoup(html.text, 'lxml')
            name_id = soup.find_all('p', style="font-size:36px;")[0].text  # company name on the detail page
            name_add = soup.find_all('p', style="font-weight: bold;color: green;font-size: 14px;")[0].text  # address / category block
            content = name_add.strip()  # trim surrounding whitespace
            content = content.split()   # split on whitespace to drop the \xa0 / \r\n\t padding
            str2 = ''.join(content)     # re-join into one compact string
            u1, u2, u3, u4, u5 = str2.split(':', 4)  # split on the first four label colons into five fields
            f1 = u2.find('行业类别')  # position of the "industry category" label inside the second field
            f2 = u2[0:f1]  # text before that label (presumably the address)
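            # Rough shape of the parse (made-up values): if the second colon-separated
            # field is "XX市XX路1号行业类别", then f2 ends up as "XX市XX路1号".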