简介
对某房源网站进行信息抓取
流程及思路
一是请求网页,获取HTML代码。
二是解析代码获取房源信息,包括地址,每日价格等等。
三是下载这些数据并依次保存在excel里面。
# _*_ coding: utf-8 _*_
import requests #导入网页请求库
import re #导入正则表达式库
from bs4 import BeautifulSoup #导入网页解析库
import xlwt
import xlrd
import parser
import os
from xlutils.copy import copy
def start_requests(url):
    """Fetch *url* and return the response body decoded as text.

    A browser-like User-Agent is sent so the listing site does not
    reject the request as coming from a bot.
    """
    headers = {
        # Fixed: the header name must be 'User-Agent' (no spaces) and the
        # UA value must not contain spaces around '/' — the original
        # 'User - Agent' key was never recognized as a real UA header.
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/63.0.3239.132 Safari/537.36')
    }
    response = requests.get(url, headers=headers)
    # Decode manually rather than using response.text so the page's own
    # declared encoding (UTF-8 here) wins over requests' guess.
    return response.content.decode()
#
def get_info(url):
    """Scrape title / address / nightly price from one listing page and
    append the rows to xian.xls.

    Uses the module-level ``count`` as both the serial number and the
    Excel row index; it is incremented once per listing written.
    """
    global count
    soup = BeautifulSoup(start_requests(url), 'lxml')
    titles = soup.select('div.pho_info>h4>em')
    addresses = soup.select('span.pr5')
    prices = soup.select('#pricePart >div> span')

    # Open/copy the workbook ONCE per page and save ONCE at the end.
    # (The original reopened and resaved the file for every single
    # listing inside the loop, and rewrote the header row each time.)
    r_xls = xlrd.open_workbook('xian.xls')
    excel = copy(r_xls)
    table = excel.get_sheet(0)

    # Header row; column order matches the per-row writes below.
    table.write(0, 0, '序号')
    table.write(0, 1, '房源')
    table.write(0, 2, '地址')
    table.write(0, 3, '链接')
    table.write(0, 4, '价格')

    for title, address, price in zip(titles, addresses, prices):
        row = count
        table.write(row, 0, row)
        table.write(row, 1, title.string.strip())
        table.write(row, 2, address.string.strip())
        table.write(row, 3, url)
        # Price is stored as an int so Excel treats it as a number.
        # NOTE(review): assumes the scraped text is a bare integer —
        # a value like "128起" would raise ValueError; confirm on site.
        table.write(row, 4, int(price.string.strip()))
        count += 1

    excel.save('xian.xls')
#获取所有的住房链接
def get_links(url):
    """Collect every listing link on one search-result page and scrape
    each linked listing via get_info()."""
    page = BeautifulSoup(start_requests(url), 'lxml')
    for anchor in page.select('#page_list>ul>li>a'):
        # Each <a> href points at an individual listing's detail page.
        get_info(anchor['href'])
#主调函数入口
if __name__ == '__main__':
    # Global listing counter shared with get_info(): serial number and
    # Excel row index, starting from row 1 (row 0 is the header).
    count = 1

    # Create a fresh workbook with a single sheet so get_info() has a
    # file to reopen and append to.
    workbook = xlwt.Workbook(encoding='utf-8')
    workbook.add_sheet('西安')
    workbook.save('xian.xls')

    # Crawl the first four search-result pages of Xi'an short-term rentals.
    base = 'http://xa.xiaozhu.com/search-duanzufang-p{}-0/'
    for url in (base.format(number) for number in range(1, 5)):
        print(url)
        get_links(url)
#
代码写得比较粗糙,还有进一步优化的空间。
在Win10的64位系统,Python3.7中已测试过,运行正常。