1、直接上代码:
#coding:utf-8
import sys
# Python 2-only workaround: reload sys so that setdefaultencoding is available
# again, then make UTF-8 the default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding("utf-8")
import pandas as pd
import urllib2
import urllib
import time
import re
from bs4 import BeautifulSoup
# Column accumulators, extended page by page. Row k of the final table is
# (totalPrice[k], houseinfo[k], followinfo[k]), so the three lists must stay
# the same length.
totalPrice = []
houseinfo = []
followinfo = []

# Compiled once, outside the page loop. re.S makes '.' match newlines too.
price_re = re.compile('<div.*?class="totalPrice".*?><span>(.*?)</span>(.*?)</div>', re.S)
region_re = re.compile('data-el="region">(.*?)</div>', re.S)
follow_re = re.compile('<span.*?class="starIcon"></span>(.*?)</div>', re.S)

for i in range(1, 3):  # result pages pg1 and pg2
    # Read the content of the listing page.
    myurl = "http://hz.lianjia.com/ershoufang/pg" + str(i)
    req = urllib2.Request(myurl)
    # The timeout keeps a stalled connection from hanging the crawl forever.
    myResponse = urllib2.urlopen(req, timeout=30)
    try:
        myPage = myResponse.read()
    finally:
        # Release the connection even if the read fails.
        myResponse.close()
    unmyPage = myPage.decode('utf-8')

    # Total price: the text inside <span> followed by the text after it.
    page_prices = [amount + suffix for amount, suffix in price_re.findall(unmyPage)]
    # Listing description with every closing anchor tag removed.
    page_infos = [region.replace('</a>', '') for region in region_re.findall(unmyPage)]
    # Follow/attention text that comes after the star icon.
    page_follows = follow_re.findall(unmyPage)

    # The three patterns are matched independently and joined purely by
    # position, so a differing count means the rows would be misaligned.
    # Fail here, naming the page, rather than build a shifted table.
    if not (len(page_prices) == len(page_infos) == len(page_follows)):
        raise ValueError(
            "%s: field counts differ (totalPrice=%d, houseinfo=%d, followinfo=%d)"
            % (myurl, len(page_prices), len(page_infos), len(page_follows)))

    totalPrice.extend(page_prices)
    houseinfo.extend(page_infos)
    followinfo.extend(page_follows)

house = pd.DataFrame({'totalprice': totalPrice, 'houseinfo': houseinfo,
                      'followinfo': followinfo})
# Single-argument print(...) gives the same output under Python 2's print
# statement; the two values are joined with a space as `print a, b` would do.
print("%s %s" % (house, house.index))
print(house.head())

# Clean the data with pandas: split each houseinfo string on '|' into six
# labelled columns (community, layout, area, orientation, decoration, elevator).
houseinfo_split = pd.DataFrame((x.split('|') for x in house.houseinfo), index=house.index,
                               columns=['小区','户型','面积','朝向','装修','电梯'])
print(houseinfo_split)
运行结果:
G:\python2.7\python.exe E:/python37/jiebacut_01/DateCrawler/lianjia/lianjia3.py