本文主要使用 BeautifulSoup 及 re 模块解析网页内容,提取新房楼盘信息,主要包括名称、位置和价格。
代码如下
# coding=utf-8
import json
import os
import re
import sys
import urllib2
import zlib

import requests
from bs4 import BeautifulSoup
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace
# during interpreter start-up, so the module is reloaded to get it back.
reload(sys)
# Use UTF-8 (instead of ASCII) for implicit str <-> unicode conversions.
sys.setdefaultencoding('utf-8')
# Fetch the city list from the new-homes home page (NOTE: the list obtained here is incomplete).
def get_text(url):
    """Download ``url`` and return the city links found on that page.

    Anchors are collected from every
    ``<div class="city20141104nr" style="display:none">`` element of the
    parsed document.

    Args:
        url: Address of the page to download.

    Returns:
        dict mapping each city name (the anchor text, encoded as GBK bytes)
        to the value of that anchor's ``href`` attribute. When two anchors
        share a name, the later one wins.
    """
    res = urllib2.urlopen(url)
    try:
        raw = res.read()
    finally:
        # Always release the connection, even if reading fails.
        res.close()

    try:
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        content = zlib.decompress(raw, 16 + zlib.MAX_WBITS)
    except zlib.error:
        # The body was not gzip-compressed; parse it as received.
        content = raw

    soup = BeautifulSoup(content, 'lxml')
    hidden_divs = soup.find_all(
        'div', class_='city20141104nr', style='display:none')

    dic_city = {}
    for div in hidden_divs:
        # href=True skips anchors that have no link target at all.
        for anchor in div.find_all('a', href=True):
            # Take the anchor's text directly instead of slicing its HTML
            # and calling strip('</a'): strip() removes a *set* of
            # characters, so it also removed any leading/trailing '<', '/'
            # or 'a' bytes that belonged to the name itself.
            city_name = anchor.get_text().encode('gbk')
            dic_city[city_name] = anchor['href']
    return dic_city
# Fetch all pages for each city.
def get_city_page(city_url):
res=urllib2.urlopen(city_url)
content=zlib.decompress(res.read(),16+zlib.MAX_WBITS)
#html = re.sub("</html>","",content,flags=re.S|re.IGNORECASE)+"</html>"
soup=BeautifulSoup(conte