1. 请求设置与目标URL
本代码使用 requests 库向多个城市的租房信息页面发送 GET 请求。目标URL为链家租房页面,不同城市的页面以 urls 列表的形式保存,每个 URL 包含页码参数(pg{}),这允许代码通过循环爬取不同的分页内容。
使用 User-Agent 和 Cookie 模拟浏览器请求头,避免反爬虫机制。
利用 fake_useragent 库生成随机的 User-Agent,使请求看起来更像是来自真实的浏览器。
headers = {
"Cookie": "lianjia_uuid=cc7d8e7e-0466-449e-8902-591fd5690ebc; ...",
"Referer": "https://bj.lianjia.com/",
"User-Agent": "Mozilla/5.0 ... Safari/537.36",
}
2. 数据爬取与解析
通过循环爬取不同城市的租房页面,每个城市随机爬取 10 到 20 页数据,以规避反爬虫策略。每次爬取后加入 time.sleep() 随机休眠,以模拟人工访问,降低被封禁的风险。
利用 BeautifulSoup 解析网页数据,获取每个房屋的相关信息,包括链接、图片、标题、位置、房屋类型、标签、品牌、发布日期、价格等。
使用了嵌套查找方式(例如 .find() 和 .find_all())获取指定的HTML标签和属性中的内容。
li_elements = bs.find_all("div", class_="content__list--item")
for li_element in li_elements:
item_url = li_element.find('a', class_='content__list--item--aside')['href']
item_img = li_element.find('img', class_='lazyload')['data-src']
item_title = li_element.find('a', class_='twoline').text.strip()
# 其余信息提取同理
3. 城市与省份映射
代码中提供了 city_province_map 字典,用于将城市的缩写(如 sz)映射到具体的城市名和省份名。这使得在生成最终的数据时,能够更好地显示出租房信息的所属地区。
city_abbr = url.split('/')[2].split('.')[0]
city, province = city_province_map.get(city_abbr, ('未知城市', '未知省份'))
4. 数据保存与异常处理
将解析得到的数据格式化并保存到 zufang.txt 文件中。每个房屋的信息按字段使用 # 分隔符,最后以换行符结束,并将其写入文本文件中。
当前代码的异常处理略显简单,未对请求失败或数据解析失败进行捕获;可以使用 try-except 加入处理机制,跳过出错的数据并继续爬取下一条数据,避免单条错误导致整个爬虫中断。
line = f"{province}#{city}#{item_url}#{item_img}#{item_title}#{item_location}#{item_house}#{item_brand}#{item_date}#{item_price}#{item_unit}\n"
with open('zufang.txt', 'a+', encoding='utf-8') as f:
f.write(line)
5. 随机休眠以防反爬虫
代码中多次使用了 time.sleep(random.randint()) 来模拟真实用户的浏览行为,减少爬虫频繁请求的可能性。爬取每页数据后随机休眠 10 到 15 秒,爬取完一个城市后再长时间休眠 600 到 700 秒。
time.sleep(random.randint(10, 15)) # 每次爬取页面后休眠
# ...
time.sleep(random.randint(600, 700)) # 爬取一个城市后长时间休眠
6. 完整的代码
import random
import time
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# Lianjia rental-listing URL templates, one per city sub-domain.
# '{}' is the page-number slot filled via str.format() during the crawl.
_CITY_ABBRS = [
    'bj', 'sh', 'tj', 'cq', 'sy', 'cc', 'hz', 'hf', 'nj', 'jn',
    'wh', 'gy', 'xa', 'lz', 'nc', 'nn', 'hk', 'km', 'ls',
    'yinchuan', 'wlmq', 'sz', 'fz', 'cs', 'cd', 'hhht',
]
urls = [
    'https://{}.lianjia.com/zufang/pg{{}}/#contentList'.format(abbr)
    for abbr in _CITY_ABBRS
]
# Maps a Lianjia sub-domain abbreviation (e.g. 'sz') to a (city, province)
# pair used to label scraped records. Lookups elsewhere fall back to
# ('未知城市', '未知省份') for unknown abbreviations. A few entries
# (e.g. 'kf', 'xg', 'am', 'tw', 'xn') have no matching URL template and
# are kept for completeness.
city_province_map = {
    'sz': ('深圳', '广东'), 'fz': ('福州', '福建'), 'cs': ('长沙', '湖南'),
    'cd': ('成都', '四川'), 'kf': ('开封', '河南'), 'bj': ('北京', '北京'),
    'sh': ('上海', '上海'), 'tj': ('天津', '天津'), 'cq': ('重庆', '重庆'),
    'sy': ('沈阳', '辽宁'), 'cc': ('长春', '吉林'), 'hz': ('杭州', '浙江'),
    'hf': ('合肥', '安徽'), 'nj': ('南京', '江苏'), 'jn': ('济南', '山东'),
    'wh': ('武汉', '湖北'), 'gy': ('贵阳', '贵州'), 'xa': ('西安', '陕西'),
    'lz': ('兰州', '甘肃'), 'xg': ('香港', '香港'), 'am': ('澳门', '澳门'),
    'tw': ('台湾', '台湾'), 'nc': ('南昌', '江西'), 'nn': ('南宁', '广西'),
    'hk': ('海口', '海南'), 'km': ('昆明', '云南'), 'ls': ('拉萨', '西藏'),
    'xn': ('西宁', '青海'), 'yinchuan': ('银川', '宁夏'),
    'wlmq': ('乌鲁木齐', '新疆'), 'hhht': ('呼和浩特', '内蒙古'),
}
# Shared HTTP headers for every request.
# NOTE(review): the Cookie value is a captured Lianjia session (it contains
# lianjia_token / security_ticket fields) — treat it as a secret and expect
# it to expire; the crawl will likely need a fresh cookie periodically.
# The User-Agent set here is only an initial value: the crawl loop overwrites
# headers['User-Agent'] with a random UA before each request.
headers = {
    "Cookie":"lianjia_uuid=cc7d8e7e-0466-449e-8902-591fd5690ebc; _ga=GA1.2.763069493.1713847893; _smt_uid=6627d291.14e57488; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218f0b8e8d41320-00d421fee02f77-26001951-1821369-18f0b8e8d421992%22%2C%22%24device_id%22%3A%2218f0b8e8d41320-00d421fee02f77-26001951-1821369-18f0b8e8d421992%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.google.com.hk%2F%22%2C%22%24latest_referrer_host%22%3A%22www.google.com.hk%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; _ga_QJN1VP0CMS=GS1.2.1713885847.1.0.1713885847.0.0.0; _ga_KJTRWRHDL1=GS1.2.1713885847.1.0.1713885847.0.0.0; _ga_RCTBRFLNVS=GS1.2.1713885855.2.0.1713885855.0.0.0; _ga_PV625F3L95=GS1.2.1713972120.1.0.1713972120.0.0.0; _jzqa=1.3496589561171229700.1713885842.1713972111.1714211778.3; _jzqx=1.1713885842.1714211778.2.jzqsr=google%2Ecom%2Ehk|jzqct=/.jzqsr=cq%2Elianjia%2Ecom|jzqct=/zufang/pg1/; _ga_TJZVFLS7KV=GS1.2.1714211781.1.0.1714211781.0.0.0; _ga_WLZSQZX7DE=GS1.2.1714211781.1.0.1714211781.0.0.0; lianjia_ssid=186e778d-61cc-4c39-b1af-698a3c7a0b6c; hip=Z6jYgvSnlICs21aYofeSkIUtUydKacrkPoo03g6GvkLZ0z6pntdltaVcVedEOwXRBkWjCxcSGMMscdrDzQNJFK8CPMVTtGk2eJ2PQ7jClcE_64iBGwHBR8cinfhGxCCf6cQdEF2AIEF2pR-Eb1JdDSx5SL1HNemRIOoJZUE_YGCWtSMkI_bxddzB-A%3D%3D; select_city=110000; login_ucid=2000000116400363; lianjia_token=2.0011153086727e7d8100b819b78d7a26dd; lianjia_token_secure=2.0011153086727e7d8100b819b78d7a26dd; security_ticket=GnAQ2O8XAFA6XFJDhX9F/f8hdp6dA9wnOfyvAzjtt8R5nuW08s8Dqko/e1jkYRbmIGfkVzrClAsgfIDq567r5dmrE2UrEw2ufK+2OOP4DJ36NV8mbWsH1/TF1L/0mDXbJA5XXjiO+Odeknf3fGbRDzbfSmHBz5sGYb1fg0M5KAI=; ftkrc_=77290a41-5bd2-48e2-8c54-77ff2b0be9eb; lfrc_=d9377256-f913-467e-89fc-9843f710938a",
    "Referer":"https://bj.lianjia.com/",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
# Main crawl loop: for every city, fetch 10-20 listing pages, parse each
# listing card, and append one '#'-separated record per listing to zufang.txt.
# Fixes over the original: correct page range (was range(1, top), i.e. 9-19
# pages, despite the stated 10-20), corrected misleading comment (said 20-40),
# a request timeout, and per-request / per-card error handling so one bad
# page or card no longer aborts the entire crawl.
for url in urls:
    # The (city, province) label depends only on the URL, so resolve it once
    # per city instead of once per page.
    city_abbr = url.split('/')[2].split('.')[0]  # e.g. 'bj' from 'https://bj.lianjia.com/...'
    city, province = city_province_map.get(city_abbr, ('未知城市', '未知省份'))
    top = random.randint(10, 20)  # crawl 10 to 20 pages per city (randomized)
    for num in range(1, top + 1):
        time.sleep(random.randint(10, 15))  # per-page pause to mimic human browsing
        new_url = url.format(num)
        print(new_url)
        headers['User-Agent'] = UserAgent().random  # rotate the UA on every request
        try:
            # timeout prevents the crawl from hanging forever on a dead connection
            res = requests.get(new_url, headers=headers, timeout=30)
            res.raise_for_status()
        except requests.RequestException as exc:
            print(f'request failed, skipping {new_url}: {exc}')
            continue
        bs = BeautifulSoup(res.text, 'lxml')
        for li_element in bs.find_all('div', class_='content__list--item'):
            # Skip a single malformed card instead of crashing the whole crawl
            # (find() may return None; attributes or the price split may fail).
            try:
                item_url = li_element.find('a', class_='content__list--item--aside')['href']
                item_img = li_element.find('img', class_='lazyload')['data-src']
                title_tag = li_element.find('a', class_='twoline')
                item_title = title_tag.text.strip() if title_tag is not None else None
                des_tag = li_element.find('p', class_='content__list--item--des')
                item_location = ''
                if des_tag is not None:
                    item_location = ' - '.join(a.text.strip() for a in des_tag.find_all('a'))
                # Concatenate the text of every child of the description tag,
                # then strip the separators Lianjia embeds ('\n', ',', ' ',
                # '-', ',/') — preserves the original output format exactly.
                item_house = ','.join(str(child.text.strip()) + ' ' for child in des_tag)
                item_house = (item_house.replace('\n', ',').replace(',', '')
                              .replace(' ', '').replace('-', '').replace(',/', ''))
                brand_tag = li_element.find('span', class_='brand')
                item_brand = brand_tag.text.strip() if brand_tag is not None else None
                date_tag = li_element.find('span', class_='content__list--item--time oneline')
                item_date = date_tag.text.strip() if date_tag is not None else None
                # Price text looks like '<amount> <unit>'; split once on the space.
                price_text = li_element.find('span', class_='content__list--item-price').text.strip()
                price_parts = price_text.split(' ')
                item_price = price_parts[0]
                item_unit = price_parts[1]
            except (AttributeError, KeyError, IndexError, TypeError) as exc:
                print(f'parse failed, skipping one listing: {exc}')
                continue
            # Tag list (<i> elements) was extracted but never written in the
            # original record format, so it is intentionally omitted here.
            line = f"{province}#{city}#{item_url}#{item_img}#{item_title}#{item_location}#{item_house}#{item_brand}#{item_date}#{item_price}#{item_unit}\n"
            print(line)
            with open('zufang.txt', 'a+', encoding='utf-8') as f:
                f.write(line)
    time.sleep(random.randint(600, 700))  # long pause between cities to reduce ban risk
7. 部分数据截图