# Notes:
# 1. Bypass the "Yunsuo" cloud-lock WAF (via the security_verify_data token).
# 2. Multi-threaded collection of exhibition listings.
import json
import threading
import requests
from bs4 import BeautifulSoup
# Worker thread wrapper for the collector
class myThread(threading.Thread):
    """Worker thread that scrapes a single listing table via collect()."""

    def __init__(self, table):
        super().__init__()
        # BeautifulSoup <table> element this worker is responsible for
        self.table = table

    def run(self):
        print("COLLECTING")
        collect(self.table)
# Collection routine (runs inside a worker thread)
def collect(table):
    """Scrape one exhibition listing row and persist it as JSON.

    If the row's status is '已发布' (published), extract the summary fields
    from *table*, fetch the exhibition's detail page for dates, location,
    organizer and description, and write the record to ``detail/<link>.txt``.

    Relies on the module-level ``webUrl`` (site base URL) and ``cookie``
    (cloud-lock session cookies) initialized in ``__main__``.

    :param table: BeautifulSoup ``<table>`` element from the listing page.
    """
    status = table.find("font").text
    if status != '已发布':
        # Unpublished exhibitions have no detail page worth fetching.
        return

    zh = {
        'province': table.find(width='10%').contents[0].contents[0].text,
        # Strip the status marker, brackets and all whitespace from the name.
        'name': table.find(width='34%').text.replace('已发布', '')
        .replace('(', '').replace(')', '').replace(" ", '')
        .replace("\r", "").replace("\n", ""),
        'link': table.find(width="12%").contents[0].contents[0].get('href'),
        'logo': table.find(width="12%").contents[0].contents[0].contents[0].get('src'),
    }
    # A bare upload directory URL means the listing carries no real logo.
    if zh['logo'] == "http://www.cnena.com/upload_files/":
        zh['logo'] = ""
    if not zh['link']:
        return

    detailUrl = '{}{}'.format(webUrl, zh['link'])
    # timeout added: without it a stalled detail request hangs this worker
    # thread forever (matches the timeout=5 used for the probe in __main__).
    respDetail = requests.get(detailUrl, cookies=cookie, timeout=5)
    if respDetail.status_code != 200:
        return

    # The site serves GBK; requests would otherwise mis-decode the body.
    respDetail.encoding = "GBK"
    soup1 = BeautifulSoup(respDetail.text, features="html.parser")

    # First info box: a single <p> whose CRLF-separated lines carry the
    # opening date / closing date / venue fields.
    p = soup1.select("div.area-sub > div:nth-child(1) > div:nth-child(2) > p")
    for line in p[0].text.split('\r\n'):
        if "开幕日期" in line:
            zh['start_time'] = line.replace("开幕日期:", "")
        if "结束日期" in line:
            zh['end_time'] = line.replace("结束日期:", "")
        if "展会地点" in line:
            zh['location'] = line.replace("展会地点:", "").replace("\n", "")

    # Second info box: organizing corporation.
    p2 = soup1.select("div.area-sub > div:nth-child(2) > div:nth-child(2) > p")
    zh['corporation'] = p2[0].text.replace(" ", '')

    # The first .main-info block is boilerplate; concatenate the rest as
    # the exhibition description.
    zh['content'] = ''.join(
        paragraph.text
        for k, paragraph in enumerate(soup1.select(".main-info"))
        if k > 0
    )

    # Explicit encoding so the output file is deterministic regardless of
    # platform locale (json.dumps emits ASCII by default, but be explicit).
    with open('detail/' + zh['link'] + ".txt", 'w', encoding='utf-8') as f:
        f.write(json.dumps(zh))
if __name__ == '__main__':
    page = 10
    url = "http://www.*****.com/showroom/listall-htm-ordertype-id-page-"
    webUrl = "http://www.*****.com/showroom/"

    # First request: the "Yunsuo" cloud-lock WAF answers with a
    # yunsuo_session_verify cookie instead of the real page.
    resp = requests.get(url, timeout=5)
    cookie = dict(resp.cookies.items())

    # Second request: replay with the verify cookie plus the magic
    # security_verify_data token to obtain security_session_mid_verify.
    resp = requests.get(
        '{}{}'.format(url, '?security_verify_data=313932302c31303830'),
        cookies=cookie,
        timeout=5,  # added: avoid hanging on a stalled WAF response
    )
    cookie.update(resp.cookies.items())
    # cookie now holds yunsuo_session_verify (1st request) and
    # security_session_mid_verify (2nd request); both are needed below.

    # Walk the listing pages from `page` down to 2 (range stops before 1 —
    # NOTE(review): page 1 is never fetched; confirm that is intentional).
    for i in range(page, 1, -1):
        requestUrl = url + str(i) + ".html"
        # timeout added: a single stalled listing page previously blocked
        # the whole crawl indefinitely.
        resp = requests.get(requestUrl, cookies=cookie, timeout=5)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, features="html.parser")
            tables = soup.select("div.srpnel > table.gridtable")
            # Table 0 is the header row; spawn one worker per data row.
            for p, table in enumerate(tables):
                if p > 0:
                    myThread(table).start()
        else:
            print(resp.status_code)
            print(resp.text)