运行本代码需要 Python 2(代码使用了 urllib2 和 print 语句),并需先安装第三方模块:
prettytable(可通过 pip install prettytable 安装)
详细代码:
#coding: gbk
import urllib
import urllib2
import re
from prettytable import PrettyTable
def getMain(url):
    """Print a table of sales figures for every building linked from a page.

    Downloads the page at *url* with ``getRead``, finds every building link
    on it, fetches each building's figures with ``getInfo`` and prints the
    collected rows as a ``PrettyTable``.

    Args:
        url: URL of the page that lists the building links. Each matched
            relative href is appended to
            ``http://www.jnfdc.gov.cn/onsaling/`` to form the detail URL.

    Returns:
        None. The table is written to standard output.
    """
    page = getRead(url)
    # Header cells, in order: building no., usage, approved units,
    # approved area, sold units, sold area, available units, available area.
    table = PrettyTable(['楼号','用途','批准销售套数','批准销售面积','已售套数','已售面积','可售套数','可售面积'],encoding='GBK')
    # Raw string so that \w, \. and \? reach the regex engine untouched.
    # Group 1 is the relative href, group 2 is the link text.
    # NOTE(review): the page is matched as the raw bytes returned by
    # getRead, so `.{1,20}` presumably caps the link text at 20 bytes,
    # not 20 characters - confirm this is intended.
    link_re = re.compile(r'<a href="(\w{5}\.\w{5}\?\w{3}=\w{8}-\w{4}-\w{4}-\w{4}-\w{12})">(.{1,20})</a>')
    for href, label in link_re.findall(page):
        # The link text is decoded as UTF-8 and re-encoded as GBK, the
        # encoding the table is created with.
        building_no = label.decode('utf-8').encode('gbk')
        table.add_row(getInfo("http://www.jnfdc.gov.cn/onsaling/" + href, building_no))
    # Parenthesised form prints the same thing under the print statement.
    print(table)
# Cell patterns are compiled once at import time instead of on every call.
# Left-aligned 10%-wide cell: the first match is used as the usage column.
_USAGE_CELL_RE = re.compile('<td width="10%" style="text-align:left;padding-left: 5px">(.{1,9})</td>')
# Right-aligned 17%-wide cells: the first two matches are used as the
# approved-for-sale unit count and area.
_APPROVED_CELL_RE = re.compile('<td width="17%" style="text-align:right;padding-right: 10px">(.{1,9})</td>')
# Right-aligned 14%-wide cells: the first four matches are used as sold
# units, sold area, available units and available area.
_SALES_CELL_RE = re.compile('<td width="14%" style="text-align:right;padding-right: 10px">(.{1,9})</td>')


def getInfo(url,buildNo):
    """Fetch one building's page and return its row of sales figures.

    Args:
        url: URL of the building detail page, downloaded with ``getRead``.
        buildNo: Building label; it is placed unchanged in the first column.

    Returns:
        An eight-item list: ``[buildNo, usage, approved units, approved
        area, sold units, sold area, available units, available area]``.
        The usage value is decoded as UTF-8 and re-encoded as GBK; the
        numeric cells are returned exactly as matched in the page.

    Raises:
        IndexError: If the page does not contain the expected number of
            matching table cells.
    """
    page = getRead(url)
    # Columns: usage, approved units, approved area, sold units,
    # sold area, available units, available area.
    usage = _USAGE_CELL_RE.findall(page)
    approved = _APPROVED_CELL_RE.findall(page)
    sales = _SALES_CELL_RE.findall(page)
    # Fail with an explanatory message instead of a bare
    # "list index out of range" when the page layout is not as expected.
    if not usage or len(approved) < 2 or len(sales) < 4:
        raise IndexError(
            "unexpected page layout at %s: found %d usage, %d approved and "
            "%d sold/available cells (need at least 1, 2 and 4)"
            % (url, len(usage), len(approved), len(sales)))
    return [
        buildNo,
        usage[0].decode('utf-8').encode('gbk'),
        approved[0],
        approved[1],
        sales[0],
        sales[1],
        sales[2],
        sales[3],
    ]
def getRead(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address to request with ``urllib2``.

    Returns:
        The response body exactly as returned by ``response.read()``.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the
            request fails.
    """
    # Local import keeps this function self-contained.
    from contextlib import closing

    request = urllib2.Request(url)
    # closing() calls response.close() even if read() raises, so the
    # connection is not left open.
    with closing(urllib2.urlopen(request)) as response:
        return response.read()
if __name__ == "__main__":
    # Script entry point: print the building table for one sample project page.
    project_page = 'http://www.jnfdc.gov.cn/onsaling/show.shtml?prjno=1c20c90d-390e-4445-8b01-44b85968d5fb'
    getMain(project_page)
需要注意的是:传给 getMain 的参数,应为在网站上依次点击「在售楼盘」→「项目名称」→「具体项目」后所打开页面的 URL。