关于爬取企业信息类的爬虫（一）

最新推荐文章于 2024-07-06 00:27:05 发布

法萌

最新推荐文章于 2024-07-06 00:27:05 发布

阅读量7.6k

点赞数 3

分类专栏：爬虫文章标签：爬虫 python3

本文链接：https://blog.csdn.net/qwdpoiguw/article/details/120471291

版权

爬虫专栏收录该内容

9 篇文章

订阅专栏

最近需要用到，根据营业执照来查询企业的名称和地址，首先想到的是企查查之类的网页版，在手动查询几十条之后，发现跳出了个账号登录的页面，无法继续查询，且网页每天每个IP的查询量有限制，遂想到了写个爬虫脚本，使用代理的方式来查。

一、urllib实现

依据fillder抓包，发现在请求qcc.com网址时（GET），会发送相关的6个cookie信息给服务器，之后由服务器返回2个cookie值（包括CDN节点acw_tc、以及企查查服务器QCCSESSID）。由于本人能力有限，无法查到GET请求中，cookie值是通过哪个JS来实现的，只好将浏览器正常访问的cookie值复制下来，所幸这些cookie值的有效期还是挺长的（至少半年的有效期）。

之后，就通过之前的文章：静态网页爬取，写代码来获取相关的数据。

#coding:utf-8
import urllib.request
import http.cookiejar
from lxml import etree
import socket

# 由于Accept-Encoding为gzip，需要解压
from io import BytesIO
import gzip

#设置超时时间为20s
#利用socket模块，使得每次重新下载的时间变短
socket.setdefaulttimeout(10)

header={ 'Host': 'www.qcc.com',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'qcc_did=a18a6f68-1260-4022-a6a7-8dab19625aab; UM_distinctid=17c02f16b7d5a3-0f9813bd7a7761-581e311d-144000-17c02f16b7e4d1; _uab_collina=163213694895228480784965; CNZZDATA1254842228=88823110-1632136235-https%253A%252F%252Fwww.baidu.com%252F%7C1632362960; zg_did=%7B%22did%22%3A%20%2217c02f16ba7466-0272837b64f905-581e311d-144000-17c02f16ba84af%22%7D; zg_294c2ba1ecc244809c552f8f6fd2a440=%7B%22sid%22%3A%201632368833885%2C%22updated%22%3A%201632369586240%2C%22info%22%3A%201632136948658%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%22undefined%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22firstScreen%22%3A%201632368833885%7D',
}

def open_url(url:str, header:dict):
    request = urllib.request.Request(url=url, headers=header)
    try:
        response = opener.open(request)
    except:
        count = 1
        while count <= 5:
            try:
                response = opener.open(request)                                               
                break
            except:
                err_info = url+'\nReloading for %d time'%count if count == 1 else 'Reloading for %d times'%count
                print(err_info)
                count += 1
            else:
                return response
        if count > 5:
            print("connection fialed!")
    else:
        return response

def set_proxy(proxy:dict, use_proxy:bool):
    opener = None
    cjar = http.cookiejar.CookieJar()
    cookie = urllib.request.HTTPCookieProcessor(cjar)
    if use_proxy:
        proxy = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(proxy, cookie)
    else:
        opener = urllib.request.build_opener(cookie)
    urllib.request.install_opener(opener)
    # 获取平台返回的两个cookie信息(acw_tc、QCCSESSID)
    url="https://www.qcc.com"
    open_url(url, header)
    return opener
    
def get_one_addr(key:str):
    # 查询访问
    link = "https://www.qcc.com/search?key="+key
    print(link)
    response = open_url(link, header)
    html = response.read()
    
    buff = BytesIO(html)
    f = gzip.GzipFile(fileobj=buff)
    htmls = f.read().decode('utf-8')
    
    # 解析
    content = etree.HTML(htmls)
    addr = content.xpath('//div[@class="contact"]//ul[@class="list-unstyled"]//li[last()]//text()')[0].split("：")[-1]
    return addr, response.getcode()

def main_get_addr(proxy:dict, key:list, use_proxy=True):
    code = 200
    opener = set_proxy(proxy, use_proxy)
    address = []
    for k in key:
        if code != 200:
            # 获取新的proxy
            opener = set_proxy(proxy, use_proxy)
        addr, code = get_one_addr(k)
        address.append((k, addr))
    return address

proxy = {'https':"211.24.95.49:47615"}
key = ['923308*****', '92330******', '9233****', '9233***']

addr = main_get_addr(proxy, key, True)
print(addr)

问题：在多次变更代理IP之后，qcc.com返回了一个405错误页面，应该是服务器的反爬措施，猜测是cookie中有些字段的问题，但无法找到问题点，遂打算使用selenium来实现。

二、selenium实现

使用selenium的好处是，避免了cookie的问题，在使用时，参考之前的文章，selenium爬取网页，并借鉴了几篇别人的反爬措施，对相应字段进行设置，包括时区、地理位置、WebRTC、自动控制、超时重连等。

from selenium import webdriver
import requests
from lxml import etree
import socket
from queue import Queue
from selenium.webdriver.chrome.service import Service

#设置超时时间
#利用socket模块，使得每次重新下载的时间变短
socket.setdefaulttimeout(20)

addr_queue = Queue(maxsize=1000)

def main_get_addr(proxy:str, key_queue, use_proxy:bool):
    """proxy = 'http://27.192.200.7:9000'"""
    global addr_queue
    chrome_options = webdriver.ChromeOptions()
    # v78版本及以上chrome解决受自动控制软件的提示
    # https://www.cnblogs.com/dream66/p/12522252.html
    chrome_options.add_experimental_option( "excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    if use_proxy:
        print('using proxy: '+proxy)
        # 使用代理
        chrome_options.add_argument("--proxy-server={}".format(proxy))
        # 关闭WebRTC，避免代理被发现
        # https://mp.weixin.qq.com/s/zqW2cN2dNJoRjOfVsNUONA
        preferences = {
            "webrtc.ip_handling_policy": "disable_non_proxied_udp",
            "webrtc.multiple_routes_enabled": False,
            "webrtc.nonproxied_udp_enabled": False
        }
        chrome_options.add_experimental_option("prefs", preferences)
        
    # 告诉chrome去掉webdriver痕迹
    # https://mp.weixin.qq.com/s/U45x8HCPpNe-22LRtXYaNQ
    chrome_options.add_argument("disable-blink-features=AutomationControlled")

    # 浏览器驱动下载 https://blog.csdn.net/kenny_pj/article/details/103646745
    chrome_options.binary_location = r'F:\360极速浏览器\360Chrome\Chrome\Application\360chrome.exe'

    # driver.quit()无法关闭
    # https://blog.csdn.net/yangfengjueqi/article/details/84338167
    c_service = Service('./chromedriver.exe')
    c_service.command_line_args()
    c_service.start()
    
    driver = webdriver.Chrome('./chromedriver.exe', options=chrome_options)

    if use_proxy:
        # 设置时区和地理位置，避免代理被发现
        # https://mp.weixin.qq.com/s/bs4FHpWvh1VEhcylgcNUTA
        def get_timezone_geolocation(ip):
            import requests
            url = f"http://ip-api.com/json/{ip}"
            response = requests.get(url)
            return response.json()

        res_json = get_timezone_geolocation(proxy.split(':')[1][2:])
        print(res_json)
        geo = {
            "latitude": res_json["lat"],
            "longitude": res_json["lon"],
            "accuracy": 1
        }
        tz = {
            "timezoneId": res_json["timezone"]
        }
        driver.execute_cdp_cmd("Emulation.setGeolocationOverride", geo)
        driver.execute_cdp_cmd("Emulation.setTimezoneOverride", tz)

    retry_num = 2
    # 处理1：获取数据，超时重连
    try:
        driver.get('https://www.baidu.com')
        driver.get('https://www.qcc.com')
    except:
        count = 1
        while count <= retry_num:
            try:
                driver.get('https://www.baidu.com')
                driver.get('https://www.qcc.com')                                           
                break
            except:
                err_info = 'Reloading for %d time'%count if count == 1 else 'Reloading for %d times'%count
                print(err_info)
                count += 1
        if count > retry_num:
            print("connection fialed!")
            driver.quit() #关闭浏览器
            c_service.stop()
            return True

    while not key_queue.empty():
        key = key_queue.get()
        url = "https://www.qcc.com/web/search?key="+key
        # 处理1：获取数据，超时重连
        try:
            driver.get(url)
        except:
            count = 1
            while count <= retry_num:
                try:
                    driver.get(url)                                            
                    break
                except:
                    err_info = url+'\nReloading for %d time'%count if count == 1 else 'Reloading for %d times'%count
                    print(err_info)
                    count += 1
            if count > retry_num:
                print("connection fialed!")
                key_queue.put(key)# 因为未查询，添加到待查列表中
                driver.quit() #关闭浏览器
                c_service.stop()
                return True

        # 处理2：405页面
        if driver.current_url == 'http://diag.qichacha.com/405.html':
            print("405 page error!")
            key_queue.put(key)# 因为未查询，添加到待查列表中
            driver.quit() #关闭浏览器
            c_service.stop()
            return True

        # 处理3：页面内容，可能会跳转到登录页面
        content = etree.HTML(driver.page_source)
        addr  =content.xpath('//span[@class="val long-text"]//text()')[0]
        if addr == []:
            print("address is none!")
            key_queue.put(key)# 因为未查询，添加到待查列表中
            driver.quit() #关闭浏览器
            c_service.stop()
            return True
        addr_queue.put((key, addr))
    driver.quit() #关闭浏览器
    c_service.stop()
    return False


if __name__=="__main__":
    key_queue = Queue(maxsize=1000)
    for k in ['923302*****', '9233****', '9233021****', '92330***']:
        key_queue.put(k)

    for p in ['http://211.65.197.93:80', 'http://182.84.145.38:3256']:
        is_change_proxy = main_get_addr(proxy=p, key_queue=key_queue, use_proxy=True)

    while not addr_queue.empty():
        print(addr_queue.get())

问题：

1、似乎是我选的proxy有问题，使用代理的时候，都打不开网页；不使用代理的话，能顺利打开；

2、因代理问题，需要频繁的打开关闭浏览器，导致driver.quit()有时超时无响应，虽然跟使用了service的方式，但有时还会有问题；