最开始采用单线程,但总共要爬取 100 页、共计 6000 多个 html 页面,效率特别低;于是临时学习了一下线程池,非常好用。
直接上代码:
import urllib.request
from lxml import etree
import threadpool
import ssl
# Disable TLS certificate verification for every HTTPS request made via
# urllib in this process (accepts self-signed / mismatched certs).
# NOTE(review): this globally weakens transport security — confirm intentional.
ssl._create_default_https_context = ssl._create_unverified_context
# Cookie parameter: URL-encoded "sale history" cookie captured from a browser
# session, replayed on every request so the site treats us as a normal visitor.
temp="sale_history_6346474=%257B%2522caseType%2522%3A%2522%257B%24caseType%257D%2522%2C%2522name%2522%3A%2522%25E5%259F%25B9%25E6%25A3%25AE%25E5%25A4%25A7%25E5%258E%25A6%2522%2C%2522useage%2522%3A%2522%25E4%25BD%258F%25E5%25AE%2585%2522%2C%2522price%2522%3A%2522%253Cem%253E330%253C/em%253E%25E4%25B8%2587%2522%2C%2522area%2522%3A%2522%253Cem%253E65.86%253C/em%253E%25E3%258E%25A1%2522%2C%2522room%2522%3A%25222%2522%2C%2522url%2522%3A%2522https%3A//sz.haofang.net/ershoufang/6346474_1.html%2522%257D;"
class HaoFang():
def __init__(self,url):
self.url=url
self.page_headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
"Cookie":temp}
def get_data(self):
try:
req=urllib.request.Request(self.url,headers=self.page_headers)
data=urllib.request.urlopen(req).read().decode('utf-8','ignore')
code=urllib.request.urlopen(req).code
uri=urllib.request.urlopen(req).url
# print("===>%s"%code)
# print("--->%s"%uri)
# print("+++>%s"%data)
return code,uri,data
except Exception as e:
print(str(e))
return 0,0,0
def get_house(self):
code,uri,data=self.get_data()
if code==200:
html=etree.HTML(data)
# house_title=html.xpath('//div[@class="house-details"]/div[@class="house-title"]/a/@title')
#print(house_title)
house_url=html.xpath('//div[@class="info fr"]/div[@class="title"]/a/@href')
print(house_url)
for x in house_url:
print(x)
req = urllib.request.Request(x, headers=self.page_headers)
data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
html = etree.HTML(data)
house_title = html.xpath('//body[@class="house-detail"]/div[@class=