分析思路
- 网页每次链家翻页的时候会有pg参数跟着,第一页就是pg1第二页就是pg2
- 每个网页对应的数据没有进行加密处理,直接用requests请求网页
- 用xpath对网页进行内容提取
- 如果对一个网页的数据爬取完毕那么就用线程池进行多线程爬取
代码实现
# Fetch one listing page and write one CSV row per house.
# `response`, not `re`: the original name shadowed the stdlib `re` module.
# A timeout keeps a stalled connection from blocking forever.
response = requests.get(url, timeout=10)
html = etree.HTML(response.text)  # parse the HTML document
li = html.xpath('//*[@id="content"]/div[1]/ul/li')  # one <li> per listing
for i in li:
    div = i.xpath('./div[1]/div')
    title = str(div[0].xpath('./a/text()')[0])
    address = ''.join(map(str, div[1].xpath('./div[1]/a/text()')))
    # '|' separates fields inside houseInfo; '_' is safer inside a CSV cell
    houseInfo = str(div[2].xpath('./div/text()')[0]).replace('|', '_')
    totalPrice, unitPrice = div[5].xpath('./div//span/text()')
    get_list = [title, address, houseInfo, str(totalPrice), str(unitPrice)]
    csv_writer.writerow(get_list)
def get_page(url):
    """Fetch one listing page and append one CSV row per house to the
    module-level ``csv_writer``.

    Args:
        url: Lianjia second-hand-house search-results page URL.
    """
    # `response`, not `re`: the original name shadowed the stdlib `re` module.
    # A timeout keeps a stalled connection from hanging a pool thread forever.
    response = requests.get(url, timeout=10)
    html = etree.HTML(response.text)  # parse the HTML document
    li = html.xpath('//*[@id="content"]/div[1]/ul/li')  # one <li> per listing
    for i in li:
        div = i.xpath('./div[1]/div')
        title = str(div[0].xpath('./a/text()')[0])
        address = ''.join(map(str, div[1].xpath('./div[1]/a/text()')))
        # '|' separates fields inside houseInfo; '_' is safer inside a CSV cell
        houseInfo = str(div[2].xpath('./div/text()')[0]).replace('|', '_')
        totalPrice, unitPrice = div[5].xpath('./div//span/text()')
        get_list = [title, address, houseInfo, str(totalPrice), str(unitPrice)]
        csv_writer.writerow(get_list)
    print(url, 'success')
# Open the output CSV once, then fan the 100 listing pages out to a thread
# pool; the executor context waits for every worker before the file closes.
with open('data.csv', mode='w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    with ThreadPoolExecutor(50) as t:
        for item in range(1, 101):
            page_url = f"https://nanchong.lianjia.com/ershoufang/pg{item}rs%E5%8D%97%E5%85%85/"
            t.submit(get_page, url=page_url)
import requests
from lxml import etree
import csv
# 导入线程池
from concurrent.futures import ThreadPoolExecutor
def get_page(url):
    """Fetch one listing page and append one CSV row per house to the
    module-level ``csv_writer``.

    Args:
        url: Lianjia second-hand-house search-results page URL.
    """
    # `response`, not `re`: the original name shadowed the stdlib `re` module.
    # A timeout keeps a stalled connection from hanging a pool thread forever.
    response = requests.get(url, timeout=10)
    html = etree.HTML(response.text)  # parse the HTML document
    li = html.xpath('//*[@id="content"]/div[1]/ul/li')  # one <li> per listing
    for i in li:
        div = i.xpath('./div[1]/div')
        title = str(div[0].xpath('./a/text()')[0])
        address = ''.join(map(str, div[1].xpath('./div[1]/a/text()')))
        # '|' separates fields inside houseInfo; '_' is safer inside a CSV cell
        houseInfo = str(div[2].xpath('./div/text()')[0]).replace('|', '_')
        totalPrice, unitPrice = div[5].xpath('./div//span/text()')
        get_list = [title, address, houseInfo, str(totalPrice), str(unitPrice)]
        csv_writer.writerow(get_list)
    print(url, 'success')
if __name__ == '__main__':
    # Open the output CSV once; the executor context below blocks until all
    # submitted pages finish, so the file outlives every worker thread.
    with open('data.csv', mode='w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        # NOTE(review): csv.writer is not documented as thread-safe; rows
        # written concurrently from 50 threads may interleave — confirm, or
        # guard writerow with a lock.
        with ThreadPoolExecutor(50) as t:  # pool of 50 worker threads
            for item in range(1, 101):  # pages 1..100
                # Scrape Nanchong second-hand house listings, one page per task.
                t.submit(get_page,
                         url=f"https://nanchong.lianjia.com/ershoufang/pg{item}rs%E5%8D%97%E5%85%85/")