import requests,re,openpyxl,os
# Default HTTP request headers passed to requests.get(); the User-Agent is a
# fixed desktop Chrome 59 (Windows 10) browser string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.104 Safari/537.36'
    ),
}
def crawing(page):
'''爬取指定页数的信息'''
try:
print('正在爬取第'+page+'页信息...')
url = 'http://bj.58.com/dashanzi/chuzu/pn'+page+'/?ClickID=1'
res = requests.get(url,headers=headers)
html = res.content.decode('utf-8')
except Exception as err:
print("爬取失败,原因是:"+str(err))
#定义查找标题、图片、户型、价格的正则表达式
title_pat='
.*?(.*?) '
pic_pat = 'lazy_src="(.*?)"'
room_pat = '
(.*?) (.*?)
'price_pat = '