Lianjia Xiamen Housing-Price Scraper
Scraping Xiamen second-hand housing data from the Lianjia website with Python:
- Setting Headers
- Crawling the Listing Pages
- Scraping Listing Details
- Main Crawler Routine
- The main Program
Setting Headers
Send browser-like headers and cookies to avoid IP bans. The Cookie value below was captured from a browser session and may need refreshing.
```python
import ast
import re
import time
from functools import reduce
from multiprocessing import Pool

import pandas as pd
import requests
from bs4 import BeautifulSoup


def url_open(url):
    """Fetch a page with browser-like headers; return the raw HTML bytes."""
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'select_city=350200; lianjia_uuid=af9ec94f-c262-4479-87fe-bae33a973984; UM_distinctid=15fc24977a8912-0ceeebd236ea8-5d153b16-15f900-15fc24977a9a45; all-lj=eae2e4b99b3cdec6662e8d55df89179a; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1510793116,1510793631; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1510794363; _smt_uid=5a0cdf9b.52d4b67b; CNZZDATA1255847100=477360082-1510791356-%7C1510791356; CNZZDATA1254525948=367678637-1510790754-%7C1510790754; CNZZDATA1255633284=1024567459-1510792602-%7C1510792602; CNZZDATA1255604082=1214039791-1510789438-%7C1510789438; _ga=GA1.2.973757450.1510793121; _gid=GA1.2.782279088.1510793121; lianjia_ssid=39d151c0-84a0-4b13-b468-b54f55c717f6',
        'Host': 'xm.lianjia.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'
    }
    # retry until the request succeeds; catch only request-level errors
    # rather than a bare except, so real bugs are not silently swallowed
    while True:
        try:
            req = requests.get(url=url, headers=headers, timeout=3)
            break
        except requests.exceptions.RequestException:
            print('timeout')
            time.sleep(1)
    return req.content
```
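A quick smoke test of the helper (a sketch; the pgN URL pattern is the one used in download_house below):

```python
# fetch the first result page and sanity-check the raw response
html = url_open('http://xm.lianjia.com/ershoufang/pg1/')
print(len(html))    # size of the response body in bytes
print(html[:60])    # should begin with an HTML doctype
```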
Crawling the Listing Pages
Collect the detail-page URL of every listing; duplicates are removed later with a set in the main routine.
```python
def get_links(url):
    """Collect the detail-page URL of every listing on one result page."""
    print(url)
    html = url_open(url)
    # parse the HTML
    soup = BeautifulSoup(html, 'html.parser')
    # locate the parent <div> of each listing
    divs = soup('div', {'class': 'info clear'})
    # the first <a> inside each div links to the house's detail page
    house_urls = [div.find('a')['href'] for div in divs]
    return house_urls
```
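Run against a single result page, it returns one detail-page URL per listing (Lianjia shows roughly 30 listings per page; that count is an assumption about the site's layout):

```python
links = get_links('http://xm.lianjia.com/ershoufang/pg1/')
print(len(links))   # number of listings found on this page
print(links[0])     # detail-page URL of the first listing
```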
Scraping Listing Details
Scrape the details of a single listing and return them as a string.
Returning the dict itself raised an error under multiprocessing; why? (A likely cause is sketched after the code below.)
```python
def get_infos(url):
    """Scrape one listing's details; return the dict serialized as a string."""
    print(url)
    html = url_open(url)
    soup = BeautifulSoup(html, 'html.parser')
    house_info = {}
    # district and neighbourhood links in the areaName panel
    soup_area = soup('div', {'class': 'areaName'})[0]('a')
    house_info['辖区'] = soup_area[0].string
    house_info['区域'] = soup_area[1].string
    house_info['小区'] = soup('a', {'class': 'info '})[0].string
    house_info['建造年份'] = soup('div', {'class': 'area'})[0](
        'div', {'class': 'subInfo'})[0].string.split('/')[0]
    # the coordinates are embedded in an inline script, not in the DOM
    position = re.findall("resblockPosition:'(.+),(.+)'",
                          html.decode('utf-8'))[0]
    house_info['经度'] = position[0]
    house_info['纬度'] = position[1]
    house_info['总价'] = soup('span', {'class': 'total'})[0].string
    house_info['均价'] = soup('span', {'class': 'unitPriceValue'})[0].contents[0]
    # the 'base' and 'transaction' panels hold key/value pairs in <li> tags
    for name in ['base', 'transaction']:
        for li in soup('div', {'class': name})[0]('li'):
            contents = [content.string.strip()
                        for content in li.contents if content.string.strip()]
            house_info[contents[0]] = contents[1]
    # for name in ['tags clear', 'baseattribute clear']:
    #     soup_baseinforms = soup('div', {'class': name})
    #     for soup_baseinform in soup_baseinforms:
    #         key = soup_baseinform('div', {'class': 'name'})[0].string
    #         value = soup_baseinform('div', {'class': 'content'})[0].get_text()
    #         house_info[key] = value.strip()
    # soup_rows = soup('div', {'id': 'infoList'})[0]('div', {'class': 'row'})
    # for soup_row in soup_rows:
    #     layout = [s.string for s in soup_row('div')]
    #     for i, name in enumerate(['面积', '朝向', '窗户']):
    #         house_info[layout[0] + name] = layout[i + 1]
    return str(house_info)
```
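On the "why?" above: plain dicts pickle fine and can be returned from pool.map directly, as the minimal sketch below (independent of the scraper) shows. The likely culprit is that the values stored here are bs4 NavigableString objects, which keep references into the whole parse tree and can break pickling; wrapping each scraped value in str() should make it possible to return the dict itself instead of the str/literal_eval round-trip.

```python
from multiprocessing import Pool

def make_record(n):
    # a dict of plain built-in types crosses process boundaries unchanged
    return {'id': n, '总价': n * 100}

if __name__ == '__main__':
    with Pool() as pool:
        print(pool.map(make_record, range(3)))
    # -> [{'id': 0, '总价': 0}, {'id': 1, '总价': 100}, {'id': 2, '总价': 200}]
```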
Main Crawler Routine
- Collect listing URLs from all result pages and de-duplicate them
- Scrape listing details with a multiprocessing pool
```python
def download_house():
    """Crawl all result pages, then scrape every listing in parallel."""
    # result pages: Lianjia paginates Xiamen listings as /ershoufang/pgN/
    urls = ['http://xm.lianjia.com/ershoufang/pg%d/' %
            page for page in range(1, 101)]
    pool = Pool()
    house_links = pool.map(get_links, urls)
    # flatten the per-page lists and de-duplicate, reusing the same pool
    urls = set(reduce(lambda x, y: x + y, house_links))
    house_infos = pool.map(get_infos, urls)
    pool.close()
    pool.join()
    # ast.literal_eval is a safer replacement for eval() on dict literals
    house_data = [ast.literal_eval(info) for info in house_infos]
    house_df = pd.DataFrame(house_data)
    # .xlsx rather than .xls: recent pandas no longer writes the legacy format
    excel_name = r"house.xlsx"
    house_df.to_excel(excel_name, index=False)
```
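A side note on the flatten step: reduce with list concatenation copies the accumulator for every page, so it is quadratic in the number of pages. itertools.chain.from_iterable is a linear drop-in (a sketch, assuming house_links keeps its list-of-lists shape):

```python
from itertools import chain

# house_links holds one list of URLs per result page;
# chain.from_iterable flattens them and set() de-duplicates
urls = set(chain.from_iterable(house_links))
```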
The main Program
The __main__ guard matters here: multiprocessing.Pool needs it to start worker processes safely on platforms that spawn rather than fork (e.g. Windows).
```python
if __name__ == '__main__':
    download_house()
```