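# Lianjia (lianjia.com) second-hand housing scraper.
# To see which request headers to send, look at a request that returned status 200.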
from lxml import etree
import requests
import csv
import time
# 定义爬取和解析数据的函数
#为了防止被服务器反爬虫禁止 所以定义头部
def spider():
    """Scrape second-hand housing listings from cc.lianjia.com.

    Fetches listing pages 1 through 10. For every listing entry it extracts
    the first link text of the position block (stored as ``apartment``), the
    first three ' | '-separated fields of the info line (layout, area,
    orientation) and the total price with a '万' suffix, then hands the row
    to ``data_write``. A progress line is printed for each saved listing.

    List entries that lack any of the expected nodes, or whose info line has
    fewer than three fields, are skipped instead of aborting the crawl.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched,
            including when the server does not respond within the timeout.
    """
    # A browser-like User-Agent so the server's anti-crawler checks do not
    # reject the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    pre_url = 'https://cc.lianjia.com/ershoufang/pg'
    saved = 0
    for page in range(1, 11):
        # Without a timeout a stalled connection would block forever.
        response = requests.get(pre_url + str(page), headers=headers, timeout=30)
        response.encoding = 'utf-8'
        # Pause between pages so the site is not crawled too quickly.
        time.sleep(5)
        selector = etree.HTML(response.text)
        for house in selector.xpath('//*[@id="content"]/div[1]/ul/li'):
            apartment = house.xpath('div[1]/div[2]/div/a[1]/text()')
            more_info = house.xpath('div[1]/div[3]/div/text()')
            price = house.xpath('div[1]/div[6]/div[1]/span/text()')
            # xpath() returns an empty list when a node is missing; indexing
            # it would raise IndexError and stop the whole crawl.
            if not (apartment and more_info and price):
                continue
            # Info line looks like:
            # 3室1厅 | 163.85平米 | 西南 | 精装 | 高楼层(共32层) | 2003年建 | 塔楼
            fields = more_info[0].split(' | ')
            if len(fields) < 3:
                continue
            house_layout, area, region = fields[:3]
            item = [apartment[0], house_layout, area, region, price[0] + '万']
            data_write(item)
            # The title is kept as the raw xpath result (a list) so the
            # progress output is unchanged.
            title = house.xpath('div[1]/div[1]/a/text()')
            saved += 1
            print(saved, '正在抓取', title)
def data_write(item):
    """Append one row to ``ljian_ershoufang.csv`` in the working directory.

    Rows accumulate across calls (and across runs of the script); delete the
    file beforehand to start from an empty result set.

    Args:
        item: Sequence of field values that make up one CSV row.
    """
    # Append mode: this function is called once per row, so opening with 'w'
    # would truncate the file on every call and keep only the last row.
    # newline='' prevents the csv module from emitting blank lines between rows.
    with open('ljian_ershoufang.csv', 'a', encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile).writerow(item)
# Start the scrape. This is a bare module-level call, so it runs whenever
# the file is executed or imported.
spider()