# coding:utf-8
import requests
import re
import random
from bs4 import BeautifulSoup
from w3lib.html import remove_tags
import csv
useragents = [ # pool of desktop/mobile browser User-Agent strings; one is picked at random for the request header below
"Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
"Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 ",
"Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ",
"Mozilla/5.0 (Linux; U; Android 3.2; ja-jp; F-01D Build/F0001) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13 ",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5 ",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 ",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
]
# HTTP request headers sent with every page fetch.
# NOTE(review): random.choice runs once at import time, so the SAME User-Agent
# is reused for every request in this run — re-randomize per request if rotation
# was the intent. Also "en;q=0" at the end of Accept-Language means "English not
# acceptable"; presumably "q=0.2" was meant — confirm before changing.
header = {
"User-Agent": random.choice(useragents),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0",
"Accept-Encoding": "gzip, deflate,br",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1"
}
# Lianjia URL slugs for Guangzhou districts; each slug gets its own listing crawl.
# NOTE(review): 'huadou' looks like a typo for the Huadu district slug 'huadu',
# and would silently yield empty/redirected pages — verify against gz.lianjia.com.
area = {'tianhe', 'yuexiu', 'liwan', 'haizhu', 'panyu', 'baiyun', 'huangpugz', 'conghua', 'zengcheng', 'huadou',
'nansha'}
# Crawl up to 99 listing pages per district, extract each listing's fields,
# echo them to stdout, and append one row per listing to 租房.csv.
for aa in area:
    c = 0  # per-district row counter written as the first CSV column
    for bb in range(1, 100):
        # Listing page bb for district aa.
        tar_url = "https://gz.lianjia.com/zufang/" + aa + "/pg" + str(bb)
        # timeout so a stalled server cannot hang the whole crawl
        response = requests.get(url=tar_url, headers=header, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        imf = soup.find_all('div', class_='content__list--item--main')  # one block per listing
        for house in imf:
            # try/except sits INSIDE the loop so one malformed listing is
            # skipped without losing the rest of the page.
            try:
                title_link = house.find('p', {'class': 'content__list--item--title twoline'}).find('a')
                region = title_link.get_text()  # listing title / short description
                # Absolute link to the listing's detail page.
                detailURL = 'https://gz.lianjia.com' + title_link['href']
                des = house.find('p', {'class': 'content__list--item--des'})  # description block (区域/地铁/小区/面积/庭室)
                des_links = des.find_all('a')
                zone = des_links[0].get_text()    # district/area
                ditie = des_links[1].get_text()   # nearby subway area
                xiaoqu = des_links[2].get_text()  # residential compound
                # The description <p> mixes tags and bare text nodes; counting
                # nodes from 2 (as the original did), positions 10 and 14 hold
                # the floor area and the room layout respectively.
                fields = []  # renamed from `list` — do not shadow the builtin
                for idx, node in enumerate(des, start=2):
                    if idx == 10 or idx == 14:
                        fields.append(node)
                mianji = str(fields[0]).strip()  # floor area text
                tishi = str(fields[1]).strip()   # room layout text
                price = house.find('span', {'class': 'content__list--item-price'}).find('em').get_text() + '元/月'  # monthly rent
                weihu = house.find('span', {'class': 'content__list--item--time oneline'}).get_text()  # maintenance/update label (not written to CSV)
                print('房子简介:',region,';所属地区:',zone,';小区:',xiaoqu,';庭室:',tishi,';面积:',mianji,';价格:',price,';临近地铁为:',ditie,':详情连接:', detailURL)
                c += 1
                # newline='' prevents blank rows on Windows; utf-8 keeps the
                # Chinese text readable when the CSV is reopened.
                with open('租房.csv', 'a', newline='', encoding='utf-8') as f:
                    csv_writer = csv.writer(f, delimiter=',')
                    csv_writer.writerow([str(c), str(region), str(zone), str(xiaoqu), str(tishi), str(mianji), str(price), str(ditie), str(detailURL)])
            except Exception as ex:
                # Best-effort crawl: report the failure and move on.
                print("出现如下异常%s"%ex)
                continue
# Stray text copied from the source web page (commented out so the file parses):
# Python3爬取某租房网数据并保存到Excel文件中(完整源码)
# 最新推荐文章于 2024-04-23 14:37:36 发布