import requests
from queue import Queue
from lxml import etree
import random
import threading
import time
def iskong(a):
    """Return the first element of *a*, or ``''`` when there is nothing to take.

    Used to unwrap xpath result lists that may be empty.

    Args:
        a: An indexable sequence (typically a list of strings), or ``None``.

    Returns:
        ``a[0]`` when *a* is non-empty, otherwise the empty string.
    """
    # Truthiness covers both an empty sequence and None, so a missing result
    # never raises from len().
    if not a:
        return ''
    return a[0]
class xiancheng(threading.Thread):
    """Worker thread that scrapes rental listings district by district.

    Each worker pulls ``[district_title, district_url]`` pairs from a shared
    queue until it is empty, walks the paginated listing of that district and
    appends one record per listing to ``我爱我家.txt``.
    """

    # Running count of listings written by all workers together.
    num = 0
    # Serialises updates of ``num`` and appends to the shared output file,
    # both of which are touched by every worker thread.
    _lock = threading.Lock()
    # Seconds to wait for a single HTTP response before giving up on it.
    _TIMEOUT = 10
    # How many times one page is attempted before the district is abandoned.
    _MAX_ATTEMPTS = 10

    def __init__(self, queue):
        """Store the shared work queue.

        Args:
            queue: Queue of ``[district_title, district_url]`` items.
        """
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process queue items until the queue is drained."""
        # Local import: only ``Queue`` is imported at file level.
        from queue import Empty

        while True:
            # get_nowait() instead of qsize()+get(): another worker may take
            # the last item between the two calls, and a blocking get() would
            # then never return.
            try:
                zhi = self.queue.get_nowait()
            except Empty:
                break
            qu = zhi[0]
            name = zhi[1]
            self.get_content(name, qu)

    @staticmethod
    def _build_header():
        """Return the request headers, with the current time appended to the cookie."""
        return {
            'Cookie': 'baidu_OCPC_pc=175b3166f6576342e1b8da3cbe982e3ac0707ac33bd7a0eb0fc4ed58457fb267a%3A2%3A%7Bi%3A0%3Bs%3A13%3A%22baidu_OCPC_pc%22%3Bi%3A1%3Bs%3A178%3A%22%22https%3A%5C%2F%5C%2Fbj.5i5j.com%5C%2F%3Fpmf_group%3Dbaidu%26pmf_medium%3Dppzq%26pmf_plan%3D%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%26pmf_unit%3D%25E6%25A0%2587%25E9%25A2%2598%26pmf_keyword%3D%25E6%25A0%2587%25E9%25A2%2598%26pmf_account%3D160%22%22%3B%7D; yfx_c_g_u_id_10000001=_ck19050816090819925118455683784; yfx_mr_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; yfx_mr_f_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; yfx_key_10000001=; _ga=GA1.2.709004015.1557302949; _gid=GA1.2.1654710142.1557302949; _Jo0OQK=2935FC0765CCE38AE35A58A9AE4B3DE14B5E64633540174D1F3F365E66D27BFA97FB56F25D7FEEE909ECB864227DFFF9F6F83E96C00A59DFD84694DC03D740946B3C57212F12283777C840763663251ADEB840763663251ADEBC1E2E9C73736BBD19547054735CB82A9GJ1Z1OA==; zufang_BROWSES=500042751%2C42729693; PHPSESSID=rhm6leq3kmicdnc55ruv0h5jff; domain=bj; Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1557323640,1557363313,1557363407,1557363838; yfx_f_l_v_t_10000001=f_t_1557302948887__r_t_1557363313214__v_t_1557365190840__r_c_1; Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=' + str(
                int(time.time())),
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }

    def _get_html(self, url):
        """Download *url* with ``self.header`` and return the body decoded as UTF-8."""
        reply = requests.get(url, headers=self.header, timeout=self._TIMEOUT)
        return reply.content.decode('utf-8')

    def _fetch_tree(self, url):
        """Return the parsed listing page for *url*, or ``None`` if it cannot be obtained.

        A response without any ``<div>`` text is treated as a redirect stub
        whose first ``<script>`` carries the real address between single
        quotes; that address is followed once per attempt. At most
        ``_MAX_ATTEMPTS`` attempts are made, with a short pause in between.
        """
        for _ in range(self._MAX_ATTEMPTS):
            try:
                tree4 = etree.HTML(self._get_html(url))
                self.tiaozhuan = tree4.xpath('//div/text()')
                if len(self.tiaozhuan) != 0:
                    return tree4
                # Redirect stub: pull the quoted target out of the script.
                scripts = tree4.xpath('//script/text()')
                pieces = scripts[0].split("'") if scripts else []
                if len(pieces) > 1:
                    tree5 = etree.HTML(self._get_html(pieces[1]))
                    if len(tree5.xpath('//div/text()')) != 0:
                        return tree5
            except requests.RequestException as exc:
                # A timeout or connection error counts as a failed attempt.
                print("request failed:", url, exc)
            # Pause before retrying rather than re-requesting in a tight loop.
            time.sleep(random.choice([.2, .3]))
        return None

    def _save_listing(self, i):
        """Extract the fields of one listing ``<li>`` element and append them to the output file."""
        # Detail-page link
        lianjie = 'https://bj.5i5j.com' + iskong(i.xpath('.//a/@href'))
        # Picture
        photo = iskong(i.xpath('.//img[@class="lazy"]/@src | .//img[@class="lazy"]/@data-src'))
        # Title
        title = iskong(i.xpath('.//img[@class="lazy"]/@title'))
        # Text nodes of the info paragraphs: the first is the room/building
        # info, the second (when present) the time / attention line.
        info_texts = i.xpath('.//div[@class="listX"]/p/text()')
        weizhi = iskong(info_texts).replace(" ", "")
        chakan = info_texts[1] if len(info_texts) > 1 else "暂无信息"
        # Clickable location text
        dizhi = iskong(i.xpath('.//div[@class="listX"]/p/a/text()'))
        # Rental type
        chuzufangshi = i.xpath('.//div[@class="jia"]/p/text()')
        if len(chuzufangshi) > 1:
            chuzufangshi = chuzufangshi[1]
        else:
            chuzufangshi = '暂无出租信息'
        # Price
        jiage = iskong(i.xpath('.//div[@class="jia"]/p//text()'))
        # Tags
        biaoqian = i.xpath('.//div[@class="listTag"]/span/text()')
        if len(biaoqian) == 0:
            biaoqian = "暂时无值"
        else:
            biaoqian = ' '.join(biaoqian)

        # Take the sequence number once under the lock so the printed number,
        # the stored number and the file line all agree even when several
        # workers finish a listing at the same time.
        with xiancheng._lock:
            xiancheng.num += 1
            seq = xiancheng.num
            print("正在爬取第", seq, "次", "标题是:", title)
            data = {
                "num": seq,
                '标题': title,
                "详情页链接": lianjie,
                "图片地址": photo,
                "房间信息": weizhi,
                "地址": dizhi,
                "时间及其关注度": chakan,
                "出租方式": chuzufangshi,
                "价格": jiage + '元/月',
                "标签": biaoqian
            }
            # Explicit UTF-8 so the Chinese text is written the same way
            # regardless of the platform's default encoding.
            with open("我爱我家.txt", 'a', encoding='utf-8') as fp:
                fp.write(str(data) + '\n')

    def get_content(self, name, qu):
        """Scrape every listing page of one district.

        Args:
            name: Base URL of the district; page ``k`` is fetched from
                ``name + 'n' + str(k)``.
            qu: District title, used only in progress messages.
        """
        pg = 0
        while True:
            time.sleep(random.choice([.2, .3]))
            pg += 1
            name1 = name + 'n' + str(pg)
            print(name1)
            self.header = self._build_header()
            tree5 = self._fetch_tree(name1)
            if tree5 is None:
                print(qu, "giving up, page could not be fetched:", name1)
                break
            # Listing items of the current page
            fangwu = tree5.xpath('//ul[@class="pList"]/li')
            if len(fangwu) == 0:
                # Page ``pg`` is the first one without listings, so pg - 1
                # pages were actually scraped.
                print(qu, "已经爬取完成" + "一共" + str(pg - 1) + '页')
                break
            for i in fangwu:
                self._save_listing(i)
# Driver script: discover the district listing URLs, queue them, and let three
# worker threads scrape them.
url = 'https://bj.5i5j.com/zufang/n2/'
# Seconds to wait for a single HTTP response, so a stalled connection cannot
# hang the script indefinitely.
request_timeout = 10
# The site randomly answers with a redirect page to deter scraping, so the
# real address has to be pulled out of that page's script.
response = requests.get(url, timeout=request_timeout).content.decode('utf-8')
tree = etree.HTML(response)
if tree.xpath('//div/text()'):
    # The response already has page content (same test the worker class uses),
    # so there is no redirect to follow.
    tiaozhuan = ''
    zhuye = url
else:
    scripts = tree.xpath('//script/text()')
    tiaozhuan = scripts[0] if scripts else ''
    quoted = tiaozhuan.split("'")
    # Fall back to the original address when no quoted target is present.
    zhuye = quoted[1] if len(quoted) > 1 else url
header = {
    'Cookie': 'baidu_OCPC_pc=175b3166f6576342e1b8da3cbe982e3ac0707ac33bd7a0eb0fc4ed58457fb267a%3A2%3A%7Bi%3A0%3Bs%3A13%3A%22baidu_OCPC_pc%22%3Bi%3A1%3Bs%3A178%3A%22%22https%3A%5C%2F%5C%2Fbj.5i5j.com%5C%2F%3Fpmf_group%3Dbaidu%26pmf_medium%3Dppzq%26pmf_plan%3D%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%26pmf_unit%3D%25E6%25A0%2587%25E9%25A2%2598%26pmf_keyword%3D%25E6%25A0%2587%25E9%25A2%2598%26pmf_account%3D160%22%22%3B%7D; yfx_c_g_u_id_10000001=_ck19050816090819925118455683784; yfx_mr_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; yfx_mr_f_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; yfx_key_10000001=; _ga=GA1.2.709004015.1557302949; _gid=GA1.2.1654710142.1557302949; _Jo0OQK=2935FC0765CCE38AE35A58A9AE4B3DE14B5E64633540174D1F3F365E66D27BFA97FB56F25D7FEEE909ECB864227DFFF9F6F83E96C00A59DFD84694DC03D740946B3C57212F12283777C840763663251ADEB840763663251ADEBC1E2E9C73736BBD19547054735CB82A9GJ1Z1OA==; zufang_BROWSES=500042751%2C42729693; PHPSESSID=rhm6leq3kmicdnc55ruv0h5jff; domain=bj; Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1557323640,1557363313,1557363407,1557363838; yfx_f_l_v_t_10000001=f_t_1557302948887__r_t_1557363313214__v_t_1557365190840__r_c_1; Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=' + str(
        int(time.time())),
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}
# Fetch the real landing page and collect the links we need from it.
response1 = requests.get(zhuye, headers=header, timeout=request_timeout).content.decode('utf-8')
tree1 = etree.HTML(response1)
# One link per district; the first entry (all Beijing rentals) is skipped.
diqu = tree1.xpath('//ul[@class="new_di_tab sTab"]/a')[1:]
print(len(diqu))
a = []  # outer list of [district title, district URL] pairs
for i in diqu:
    b = []  # one pair tying a district title to its URL
    diqu_a = 'https://bj.5i5j.com' + i.xpath('./@href')[0]
    diqu_title = i.xpath('./@title')[0]
    b.append(diqu_title)
    b.append(diqu_a)
    a.append(b)
# Fill the work queue.
queue = Queue()
craw_tread = []
start_time = time.time()
for i in a:
    queue.put(i)
# Start one worker per entry in the list; the entries themselves are unused.
xianchen_list = ['c1', 'c2', 'c3']
for craw in xianchen_list:
    craw1 = xiancheng(queue)
    craw1.start()
    craw_tread.append(craw1)
# Wait for every worker before reporting the elapsed time.
for thread in craw_tread:
    thread.join()
end_time = time.time()
print("最后时间是:", end_time - start_time)