# Day 1: web-scraping assignment
"""
创建者:卢俊杰
创建时间:2023/1/3,17:45
"""
import re
from lxml import etree
import requests
def get_info(response_text, num_items=30):
    """Parse one rental-listing page and extract per-listing details.

    Args:
        response_text: Raw HTML of a listing page (e.g. cd.zu.ke.com/zufang).
        num_items: Number of listing slots to read (the site shows 30 per
            page; kept as a parameter so other page sizes work too).

    Returns:
        A list of dicts with keys 'location', 'price', 'area' (all strings).
        Missing price/area fields yield '' instead of raising, so a short
        page no longer crashes the scrape.
    """
    tree = etree.HTML(response_text)
    infos = []
    # Single pass per listing instead of three separate loops over the
    # same index range; the common XPath prefix is hoisted once.
    for i in range(1, num_items + 1):
        base = f'//*[@id="content"]/div[1]/div[1]/div[{i}]/div'
        # Location: up to three nested region links, joined with '-'.
        fragments = []
        for j in range(1, 4):
            fragments += tree.xpath(f'{base}/p[2]/a[{j}]/text()')
        location = '-'.join(fragments)
        # Price: guard the [0] lookup — an absent slot returns an empty
        # list from xpath, which previously raised IndexError.
        prices = tree.xpath(f'{base}/span/em/text()')
        price = prices[0] if prices else ''
        # Area: first number in the free text of p[2].  The decimal part
        # is now optional (r'\d+(?:\.\d*)?'), so integer areas like "30"
        # match instead of making re.search return None and crash.
        area_text = ''.join(tree.xpath(f'{base}/p[2]/text()'))
        match = re.search(r'\d+(?:\.\d*)?', area_text)
        area = match.group() if match else ''
        infos.append({'location': location, 'price': price, 'area': area})
    return infos
# Browser-like User-Agent so the site serves the normal listing page.
head = {
    'user-agent': 'Mozilla/4.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/102.0.5005.136 Safari/537.36 '
}

# One inner list of listing dicts per scraped page.
all_infos = []
# The site's pagination is 1-indexed: pg1..pg10.  The original range(10)
# started at pg0, which duplicates pg1 — an off-by-one over-fetch.
for page in range(1, 11):
    response = requests.get(
        f'https://cd.zu.ke.com/zufang/pg{page}/#contentList',
        headers=head,
        timeout=10,  # never hang forever on a stalled connection
    )
    all_infos.append(get_info(response.text))
print(all_infos)