python爬虫之安居客房价涨跌情况数据抓取
前言:这个是本小编第三次写文章,我想把我自身学到的爬虫技术和大家分享,并且大家有什么爬虫或者python上的问题可以在评论区或者私信我,我会尽力帮大家解决问题。
废话不多说,直接步入正题!今天给大家分享的是安居客的二手房价数据抓取:
1.首先加入相应的库文件:
import requests
from lxml import etree
import csv
import random
import queue
# 下面的库是在你被安居客官网封禁时使用的:用随机 User-Agent 库进行反反爬
from fake_useragent import UserAgent
2.爬取安居客的数据时用的头文件和ip代理池函数:
def source():
    """Build the request configuration for Anjuke.

    Returns a two-element list ``[headers, proxies]``: the fixed request
    headers dict and a proxies mapping with one randomly chosen HTTP
    proxy.  Callers index the result as ``[0]`` / ``[1]``.
    """
    proxy_pool = [
        '103.216.147.33:8080', '182.34.35.44:9999', '1.0.205.87:8080',
        '27.110.167.164:8080', '45.115.175.112:57919', '36.91.188.18:8080',
        '223.100.166.3:36945', '202.118.164.12:8080', '185.238.239.83:8090',
        '36.92.5.194:8089',
    ]
    headers = {
        'accept-language': 'zh-CN,zh;q=0.8',
        'cookie': 'cna=Cyv5FqAiN1oCASeaC8G+vbAq; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; miid=1829095967314479518; cookie2=2c512579ab015453838fa2132fc3ef60; t=926ab1aa6ad9311b4e6cdc23c4941471; _tb_token_=3d4751545a05e; _m_h5_tk=230472b1e8aa7a8da84bf6e456645fbd_1602513978629; _m_h5_tk_enc=27e156111b8ae370b234604570dc48d2; xlly_s=1; _samesite_flag_=true; sgcookie=E100Flev9NM6U3WMmyPSkedu%2FnQiH3GHpfxY6WyxG5u%2BFCcEXzc1Z9v%2BcHjh3wJFFr5SlMEvsjqss0eVWeqrASQu7w%3D%3D; uc3=lg2=UIHiLt3xD8xYTw%3D%3D&nk2=F5RMHUA0MW7CsP4e&vt3=F8dCufHBw8o5mIMUduo%3D&id2=UUphy%2FeECc8cXlMdGA%3D%3D; csg=a0c2b18d; lgc=tb9440288677; dnk=tb9440288677; skt=08b38c23de6147f5; existShop=MTYwMjUwNTA4Mg%3D%3D; uc4=nk4=0%40FY4HWGpogENoxED%2BuCag%2BWZ7q1%2FTEBs%3D&id4=0%40U2grEJAXqvA5P0Z6volKBsw2OG0JjEOv; tracknick=tb9440288677; _cc_=VFC%2FuZ9ajQ%3D%3D; mt=ci=18_1; uc1=cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie14=Uoe0b0C%2BTfcZWQ%3D%3D&cookie21=URm48syIYn73&existShop=false&pas=0; isg=BM3NGIu4riib4AtI63PrU4eo3OlHqgF8MwhCyA9SCWTTBu241_oRTBuUcJpg3Rk0; l=eBgd8CFIQfO7RutQBOfanurza77OSIRYYuPzaNbMiOCP_JfB5DSCWZ5J2nY6C3GVh62WR3RSn5QMBeYBqQAonxvOujZOzakmn; tfstk=cGEVBFDsgiIVmDm1JmiwlZ-aKsyAwDq3XQksoPPEL7s38xfmFCHlFC3rRXIoo',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    }
    # One random proxy per call keeps successive requests from always
    # hitting the site through the same IP.
    proxies = {'http': random.choice(proxy_pool)}
    return [headers, proxies]
3.准备一个csv文件用来存放数据:
def csv_create(filename='全国房价.csv'):
    """Append the CSV header row for the scraped price data.

    Args:
        filename: target CSV path; defaults to the file the spider writes,
            so existing callers (``csv_create()``) are unaffected.
    """
    header = ['城市', '年份', '房屋价格', '房屋较上个月的涨跌情况']
    # Explicit encoding: the original relied on the platform default,
    # which can raise UnicodeEncodeError (or produce mojibake) for the
    # Chinese header on non-UTF-8 locales such as Windows GBK.
    # Plain 'utf-8' (not 'utf-8-sig') because the file is opened in
    # append mode and a BOM must not be inserted mid-file.
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(header)
4.这时候就需要写相应的提取逻辑和抓取数据的函数了,首先我们写提取数据逻辑(本农农用的是xpath提取规则,也可以使用re正则或者BeautifulSoup进行数据的定位提取,想要本农农出xpath的提取规则和使用实战的小伙伴可以私信我哦):
def round_spider():
    """Crawl Anjuke's nationwide yearly price pages (2012-2021) and append
    city / month / price / change rows to 全国房价.csv, printing each row.
    """
    for year in range(2012, 2022):
        url = 'https://www.anjuke.com/fangjia/quanguo{}/'.format(year)
        # Fetch the header/proxy pair ONCE: the original called source()
        # twice, pairing headers from one call with a proxy from another.
        headers, proxies = source()
        response = requests.get(url, headers=headers, proxies=proxies)
        # Parse the HTML directly.  The original round-tripped the text
        # through a queue.Queue and called duilie.get(response.text),
        # which passed the HTML as Queue.get()'s `block` argument — it
        # only worked because a non-empty string is truthy.
        fangjia = etree.HTML(response.text)
        # Up to 50 city entries are listed per year page.
        for j in range(1, 51):
            fangjia_detail = fangjia.xpath(
                '/html/body/div[2]/div[4]/div[1]/div[1]/ul/li[{}]/a/@href'.format(j))
            fangjia_diqu = fangjia.xpath(
                '/html/body/div[2]/div[4]/div[1]/div[1]/ul/li[{}]/a/b/text()'.format(j))
            # Guard against pages listing fewer than 50 cities; the
            # original would raise IndexError on fangjia_detail[0].
            if not fangjia_detail or not fangjia_diqu:
                continue
            # Reuse the same headers instead of the duplicated literal
            # the original re-declared inside this loop.
            response1 = requests.get(fangjia_detail[0], headers=headers)
            fangjia_yue = etree.HTML(response1.text)
            # Open the CSV once per city rather than once per row, and
            # let csv.writer handle quoting instead of manual commas.
            # 'utf-8' (not 'utf-8-sig') to avoid a BOM mid-file in
            # append mode.
            with open('全国房价.csv', 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                # Up to 12 monthly entries on the city detail page.
                for k in range(1, 13):
                    yue = fangjia_yue.xpath(
                        '/html/body/div[2]/div[5]/div[1]/div[1]/ul/li[{}]/a/b/text()'.format(k))
                    jiage = fangjia_yue.xpath(
                        '/html/body/div[2]/div[5]/div[1]/div[1]/ul/li[{}]/a/span/text()'.format(k))
                    zhangfu = fangjia_yue.xpath(
                        '/html/body/div[2]/div[5]/div[1]/div[1]/ul/li[{}]/a/em/text()'.format(k))
                    # Skip missing months; the original only checked the
                    # first list (against a reused empty-list variable)
                    # and could still IndexError on the other two.
                    if not (yue and jiage and zhangfu):
                        continue
                    print(fangjia_diqu[0], yue[0], jiage[0], zhangfu[0])
                    writer.writerow([fangjia_diqu[0], yue[0], jiage[0], zhangfu[0]])
5.接着写相应的主函数:
if __name__ == '__main__':
    # Write the CSV header row, then run the crawl.  The original also
    # called source() here and discarded its result — a dead call
    # (round_spider fetches its own headers/proxies), so it is removed.
    csv_create()
    round_spider()