This Lianjia scraping project needs a proxy pool; for how to set one up, see my earlier article.
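Before running the crawler, it is worth checking that the pool is actually serving usable proxies. A minimal sanity check, assuming the pool runs locally and its /get endpoint returns a bare ip:port string (this matches what get_proxies() in the crawler below expects; httpbin.org/ip is just an echo service):

import requests

# Pull one proxy from the pool and echo our apparent IP through it.
# The pool address and the plain "ip:port" response format are assumptions
# taken from the crawler code below; adjust them to your own setup.
proxy_addr = requests.get('http://localhost:5000/get').text.strip()
proxies = {'http': 'http://' + proxy_addr}
print(requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10).text)

The full crawler: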
import hashlib
import requests
from lxml import etree
import pymongo

class Lianjia(object):
    def __init__(self, url):
        self.url = url
        self.proxies = self.get_proxies()
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.db = self.client['lianjia']
        self.main()

    def get_proxies(self):
        # The proxy pool service returns a bare "ip:port" string.
        try:
            response = requests.get('http://localhost:5000/get')
            proxies = {
                'http': 'http://' + response.text
            }
            return proxies
        except Exception:
            return None

    def get_xpath_by_requests(self, url, proxies):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'Referer': 'https://bj.lianjia.com/?utm_source=baidu&utm_medium=pinzhuan&utm_term=biaoti&utm_content=biaotimiaoshu&utm_campaign=sousuo&ljref=pc_sem_baidu_ppzq_x',
        }
        try:
            response = requests.get(url, headers=headers, proxies=proxies)
            return etree.HTML(response.text)
        except Exception:
            # The current proxy failed; fetch a fresh one and retry.
            proxies_new = self.get_proxies()
            print('fetching a new proxy', proxies_new)
            return self.get_xpath_by_requests(url, proxies_new)

    def get_text(self, text):
        # XPath returns a list of matches; take the first or fall back to ''.
        if text:
            return text[0]
        return ''

    def get_md5(self, value):
        # The md5 of the detail URL serves as a stable dedup key.
        md5 = hashlib.md5(bytes(value, encoding='utf-8'))
        return md5.hexdigest()

    def write_to_mongo(self, item):
        # Upsert keyed on the URL hash, so re-running the crawler updates
        # existing listings instead of inserting duplicates.
        item['hash_url'] = self.get_md5(item['detail_url'])
        self.db['beijing'].update_one({'hash_url': item['hash_url']},
                                      {'$set': item}, upsert=True)

    def parse_page(self, div_list):
        for div in div_list:
            title = self.get_text(div.xpath('.//p[@class="content__list--item--title twoline"]/a/text()')).strip()
            price = self.get_text(div.xpath('.//span[@class="content__list--item-price"]/em/text()'))
            detail_url = self.get_text(div.xpath('.//p[@class="content__list--item--title twoline"]/a/@href'))
            item = {}
            item['title'] = title
            item['price'] = price
            item['detail_url'] = detail_url
            self.write_to_mongo(item)

    def parse_area(self, url):
        # Walk the area's listing pages (pg1, pg2, ...) until an empty page.
        i = 1
        while True:
            page_url = url + 'pg{}'.format(i)
            html = self.get_xpath_by_requests(page_url, self.proxies)
            div_list = html.xpath('//div[@class="content__list"]/div')
            if not div_list:
                break
            self.parse_page(div_list)
            i += 1

    def main(self):
        html = self.get_xpath_by_requests(self.url, self.proxies)
        # District links from the filter bar; position()>1 skips the first,
        # catch-all entry.
        areas = html.xpath('//div[@id="filter"]/ul[2]/li[position()>1]/a/@href')
        for area in areas:
            area_url = 'https://bj.lianjia.com' + area
            self.parse_area(area_url)


if __name__ == '__main__':
    base_url = 'https://bj.lianjia.com/zufang/'
    Lianjia(base_url)
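Once a run finishes, you can spot-check what landed in MongoDB. A minimal sketch, assuming the same local instance and the lianjia database / beijing collection used above:

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client['lianjia']['beijing']

# Count the stored listings and print a few, hiding Mongo's internal _id field.
print(collection.count_documents({}))
for doc in collection.find({}, {'_id': 0}).limit(5):
    print(doc)

Since write_to_mongo() looks each document up by hash_url on every upsert, it is also worth creating an index on that field once (collection.create_index('hash_url')) so writes stay fast as the collection grows.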