#coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding("utf-8") #设定编码要放在最上面。之前放在引用库的最下面,然后加到pandas的DataFrame老是出现中文字符为问号的乱码
import pandas as pd
import urllib2
import urllib
import time
import re
from bs4 import BeautifulSoup
# Read the content of the target web page (page number 1 is appended to the URL).
myurl = "http://hz.lianjia.com/ershoufang/pg" + str(1)
req = urllib2.Request(myurl)
# A timeout keeps the script from blocking forever on an unresponsive server.
myResponse = urllib2.urlopen(req, timeout=30)
try:
    myPage = myResponse.read()
finally:
    # Release the connection even if reading the body fails.
    myResponse.close()
unmyPage = myPage.decode('utf-8')  # decode the raw bytes into unicode text
# Match the total price of each listing: the text inside <span> joined with
# the text that follows it, up to the closing </div>.
price_pattern = re.compile('<div.*?class="totalPrice".*?><span>(.*?)</span>(.*?)</div>', re.S)
c1 = price_pattern.findall(unmyPage)
# str() turns the joined unicode text into a byte string; for non-ASCII text
# this depends on the utf-8 default encoding configured at the top of the file.
totalPrice = [str(amount + unit) for amount, unit in c1]
# Match the listing information: the text between each data-el="region">
# marker and the next </div>.
region_pattern = re.compile('data-el="region">(.*?)</div>', re.S)
c2 = region_pattern.findall(unmyPage)
# The matches are stored unchanged (no re-encoding), in page order.
houseinfo = list(c2)
# Match the follow/attention text of each listing: whatever sits between the
# empty "starIcon" span and the next </div>.
follow_pattern = re.compile('<span.*?class="starIcon"></span>(.*?)</div>', re.S)
c3 = follow_pattern.findall(unmyPage)
# Copy the matches into their own list, unchanged and in page order.
followinfo = c3[:]
# Assemble the three extracted lists into one table, one column per list.
columns = {
    'totalprice': totalPrice,
    'houseinfo': houseinfo,
    'followinfo': followinfo,
}
house = pd.DataFrame(columns)
# Show the Python type of the first cell of each column
# (noted by the original author as: str, unicode, unicode).
for column_name in ('totalprice', 'houseinfo', 'followinfo'):
    print(type(house[column_name][0]))
# Preview the first rows of the table.
print(house.head())
2、运行结果:
G:\python2.7\python.exe E:/python37/jiebacut_01/DateCrawler/lianjia/lianjia2.py
<type 'str'>
<type 'unicode'>
<type 'unicode'>
followinfo ... totalprice
0 173人关注 / 共58次带看 / 3个月以前发布 ... 950万
1 238人关注 / 共33次带看 / 6个月以前发布 ... 480万
2 164人关注 / 共19次带看 / 3个月以前发布 ... 270万
3 156人关注 / 共11次带看 / 11个月以前发布 ... 585万
4 70人关注 / 共26次带看 / 3个月以前发布 ... 250万
[5 rows x 3 columns]
Process finished with exit code 0