# Notes:
# 1. Bypass the "Yunsuo" cloud-lock WAF (via the security_verify_data token).
# 2. Multi-threaded collection of exhibition listings.
import json
import threading
import requests
from bs4 import BeautifulSoup
# Worker thread wrapper for the collector
class myThread(threading.Thread):
    """Worker thread that scrapes a single listing table via collect()."""

    def __init__(self, table):
        super().__init__()
        # BeautifulSoup <table> element this worker is responsible for
        self.table = table

    def run(self):
        print("COLLECTING")
        collect(self.table)
# Collection routine (runs inside a worker thread)
def collect(table):
    """Scrape one exhibition listing row and persist it as JSON.

    If the row's status is '已发布' (published), extract the summary fields
    from *table*, fetch the exhibition's detail page for dates, location,
    organizer and description, and write the record to ``detail/<link>.txt``.

    Relies on the module-level ``webUrl`` (site base URL) and ``cookie``
    (cloud-lock session cookies) initialized in ``__main__``.

    :param table: BeautifulSoup ``<table>`` element from the listing page.
    """
    status = table.find("font").text
    if status != '已发布':
        # Unpublished exhibitions have no detail page worth fetching.
        return

    zh = {
        'province': table.find(width='10%').contents[0].contents[0].text,
        # Strip the status marker, brackets and all whitespace from the name.
        'name': table.find(width='34%').text.replace('已发布', '')
        .replace('(', '').replace(')', '').replace(" ", '')
        .replace("\r", "").replace("\n", ""),
        'link': table.find(width="12%").contents[0].contents[0].get('href'),
        'logo': table.find(width="12%").contents[0].contents[0].contents[0].get('src'),
    }
    # A bare upload directory URL means the listing carries no real logo.
    if zh['logo'] == "http://www.cnena.com/upload_files/":
        zh['logo'] = ""
    if not zh['link']:
        return

    detailUrl = '{}{}'.format(webUrl, zh['link'])
    # timeout added: without it a stalled detail request hangs this worker
    # thread forever (matches the timeout=5 used for the probe in __main__).
    respDetail = requests.get(detailUrl, cookies=cookie, timeout=5)
    if respDetail.status_code != 200:
        return

    # The site serves GBK; requests would otherwise mis-decode the body.
    respDetail.encoding = "GBK"
    soup1 = BeautifulSoup(respDetail.text, features="html.parser")

    # First info box: a single <p> whose CRLF-separated lines carry the
    # opening date / closing date / venue fields.
    p = soup1.select("div.area-sub > div:nth-child(1) > div:nth-child(2) > p")
    for line in p[0].text.split('\r\n'):
        if "开幕日期" in line:
            zh['start_time'] = line.replace("开幕日期:", "")
        if "结束日期" in line:
            zh['end_time'] = line.replace("结束日期:", "")
        if "展会地点" in line:
            zh['location'] = line.replace("展会地点:", "").replace("\n", "")

    # Second info box: organizing corporation.
    p2 = soup1.select("div.area-sub > div:nth-child(2) > div:nth-child(2) > p")
    zh['corporation'] = p2[0].text.replace(" ", '')

    # The first .main-info block is boilerplate; concatenate the rest as
    # the exhibition description.
    zh['content'] = ''.join(
        paragraph.text
        for k, paragraph in enumerate(soup1.select(".main-info"))
        if k > 0
    )

    # Explicit encoding so the output file is deterministic regardless of
    # platform locale (json.dumps emits ASCII by default, but be explicit).
    with open('detail/' + zh['link'] + ".txt", 'w', encoding='utf-8') as f:
        f.write(json.dumps(zh))
if __name__ == '__main__':
    page = 10
    url = "http://www.*****.com/showroom/listall-htm-ordertype-id-page-"
    webUrl = "http://www.*****.com/showroom/"

    # First request: the "Yunsuo" cloud-lock WAF answers with a
    # yunsuo_session_verify cookie instead of the real page.
    resp = requests.get(url, timeout=5)
    cookie = dict(resp.cookies.items())

    # Second request: replay with the verify cookie plus the magic
    # security_verify_data token to obtain security_session_mid_verify.
    resp = requests.get(
        '{}{}'.format(url, '?security_verify_data=313932302c31303830'),
        cookies=cookie,
        timeout=5,  # added: avoid hanging on a stalled WAF response
    )
    cookie.update(resp.cookies.items())
    # cookie now holds yunsuo_session_verify (1st request) and
    # security_session_mid_verify (2nd request); both are needed below.

    # Walk the listing pages from `page` down to 2 (range stops before 1 —
    # NOTE(review): page 1 is never fetched; confirm that is intentional).
    for i in range(page, 1, -1):
        requestUrl = url + str(i) + ".html"
        # timeout added: a single stalled listing page previously blocked
        # the whole crawl indefinitely.
        resp = requests.get(requestUrl, cookies=cookie, timeout=5)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, features="html.parser")
            tables = soup.select("div.srpnel > table.gridtable")
            # Table 0 is the header row; spawn one worker per data row.
            for p, table in enumerate(tables):
                if p > 0:
                    myThread(table).start()
        else:
            print(resp.status_code)
            print(resp.text)