#!/usr/bin/env python
#coding:utf-8
'''
Created on Jul 21, 2013
@author: belong
'''
import urllib
import re
import datetime
import MySQLdb
import time
start_url = "https://xively.com/search?q=temperature&category="
'''
Utility class
'''
class Tools:
    # Print a simple log line tagged with a severity level
    def write_log(self, level, info):
        print '[' + level + ']' + info
    # Regex matching; returns the concatenated match results as a string.
    # pattern is the regular expression; single is a boolean flag:
    # when True, only the first match is kept.
    def match_regex(self, pattern, src, single):
        result = ""
        com = re.compile(pattern, re.I | re.DOTALL)
        matchers = com.findall(src)
        for matcher in matchers:
            result += matcher + " "
            if single:  # stop after the first match if only one is needed
                break
        return result.strip()  # strip() removes the surrounding whitespace
    # Convert a byte string in the given charset to unicode; returns the decoded string.
    # unicode(s, charset) is equivalent to s.decode(charset).
    def to_unicode(self, s, charset):
        if charset != "":
            s = unicode(s, charset)
        return s
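
# A minimal usage sketch of match_regex on a made-up input (the pattern and the
# sample string below are illustrative, not taken from a real Xively page).
# With a capture group, findall returns the captured text; `single` keeps only
# the first match, otherwise all matches are joined with spaces.
def _demo_match_regex():
    t = Tools()
    sample = 'id:123,id:456,'  # hypothetical page fragment
    print t.match_regex(r'id:(\d+),', sample, True)   # -> 123
    print t.match_regex(r'id:(\d+),', sample, False)  # -> 123 456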
'''**********************************************************'''
'''
URL crawling and page-fetching class
'''
class Crawler:
    # Extract all Xively feed URLs from the page at url
    def get_url(self, url):
        fp = urllib.urlopen(url)
        # Broader alternative patterns, kept for reference:
        # pattern = re.compile("http://.*?\.com", re.I)
        # pattern = re.compile(r'[a-zA-z]+://[^\s]*\.html?', re.I)
        pattern = re.compile(r'https://xively.com/feeds/\d+', re.I)
        data = fp.read()  # read the whole response body at once
        fp.close()
        return pattern.findall(data)  # findall on an empty body simply returns []
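
    # A minimal sketch of how the feed pattern behaves on a made-up HTML fragment
    # (hypothetical feed ids, shown as a comment so the class body stays unchanged):
    #   html = '<a href="https://xively.com/feeds/1234">a</a><a href="https://xively.com/feeds/5678">b</a>'
    #   re.findall(r'https://xively.com/feeds/\d+', html)
    #   # -> ['https://xively.com/feeds/1234', 'https://xively.com/feeds/5678']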
    # Fetch the HTML page at url and return it as a string
    def get_page(self, url):
        fp = urllib.urlopen(url)  # urlopen returns a file-like object
        page = fp.read()  # read the whole HTML page into one string
        fp.close()
        return page
    # Download the page at url into a local file
    def download_url(self, url, filename):
        Tools().write_log("Info", "Download started")
        f = open(filename, 'w')  # open filename for writing
        f.write(self.get_page(url))  # write the page string to the file
        f.close()
        Tools().write_log("Info", "Page downloaded successfully")
    # Breadth-first traversal starting from start_url, visiting at most `number` pages
    def broad_traverse(self, start_url, number):
        print "\n" * 2
        print '=====' * 20
        Tools().write_log("Info", "Traversal started")
        start_time = datetime.datetime.now()  # traversal start time
        visited = []  # URLs already visited
        unvisited = []  # URLs discovered but not yet visited
        unvisited.append(start_url)
        while len(unvisited):  # keep going while unvisited URLs remain
            if len(visited) < number:  # stop once the preset visit quota is reached
                url = unvisited.pop(0)  # pop(0) removes and returns the head of the queue (FIFO)
                print "---" * 20
                print url, len(visited), len(unvisited)
                i = len(visited)
                self.download_url(url, str(i) + '.html')  # save the whole page to disk
                visited.append(url)  # mark as visited
                url_list = self.get_url(url)  # all feed links on the page (may contain duplicates)
                for eachlink in url_list:
                    if eachlink not in unvisited and eachlink not in visited:
                        unvisited.append(eachlink)  # enqueue only previously unseen URLs
            else:
                break
        Tools().write_log("Info", "Traversal finished")
        end_time = datetime.datetime.now()  # traversal end time
        interval = end_time - start_time
        Tools().write_log("Info", "Traversal time: %s" % (interval))
        print "=====" * 20
        print "\n" * 2
        return visited
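
# A minimal usage sketch (this hits the live site, so it is defined here but
# never called automatically; the page budget of 2 is arbitrary):
def _demo_broad_traverse():
    feeds = Crawler().broad_traverse(start_url, 2)  # visit at most 2 pages
    print feeds  # the visited URLs, starting with start_url itself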
'''**********************************************************'''
'''
Content extraction class
'''
class DataExtractor():
    # 1. Device id
def get_id(self, data):
device_id = Tools().match_regex(r'id:(\d*?),', data, False)
return device_id
    # 2. Device title
def get_title(self, data):
title = Tools().match_regex(r'title:(.*?),p', data, False)
return title
    # 3. Device description
def get_description(self, data):
description = Tools().match_regex(r'description:(.*?),', data, False)
return description
    # 4. Device status
def get_status(self, data):
status = Tools().match_regex(r'status:(.*?),', data, True)
return status
    # 5. Device location
def get_location(self, data):
location = Tools().match_regex(r'location:{(.*?)}', data, True)
return location
    # Charset declared in the page source
def get_charset(self, data):
charset = Tools().match_regex(r'charset="(.*?)"', data, True)
return charset
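
    # For example, a page containing charset="utf-8" yields 'utf-8'
    # (illustrative; the attribute form on the real pages may differ).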
    # All device info for one feed page
    def get_info(self, url):
        try:
            data = Crawler().get_page(url)
            charset = self.get_charset(data)
            data = Tools().to_unicode(data, charset)  # decode the page to unicode
            data = Tools().match_regex(r'JSON\.parse(.*?);', data, False)
            data = re.sub(r'\\\"', "", data)  # drop escaped quotes so the field regexes match
            print data
        except Exception:
            Tools().write_log("Error", url + " failed to fetch or parse the page")
            raise
        rst = {}  # result dictionary
rst['id'] = self.get_id(data)
rst['title'] = self.get_title(data)
rst['description'] = self.get_description(data)
rst['status'] = self.get_status(data)
rst['location'] = self.get_location(data)
return rst
    # Extract an info dict for every URL in the list
    def get_all(self, url_list):
rst_list = []
for url in url_list:
rst = self.get_info(url)
rst_list.append(rst)
return rst_list
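
# A minimal sketch of the field extractors on a made-up, already-unescaped data
# string shaped the way the regexes expect (all values below are hypothetical):
def _demo_extractors():
    d = DataExtractor()
    sample = 'id:42,title:office temp,p description:room sensor,status:live,location:{lat:0,lon:0}'
    print d.get_id(sample)        # -> 42
    print d.get_status(sample)    # -> live
    print d.get_location(sample)  # -> lat:0,lon:0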
'''**********************************************************'''
'''
Database persistence class
'''
class DBAdapter:
    # Open a connection to the local MySQL server
    def db_connect(self):
conn = MySQLdb.connect(host = "localhost",
user = "root",
passwd = "123456",
charset = "utf8")
return conn
    def run(self, url_list):
        conn = self.db_connect()  # connect to MySQL
        cur = conn.cursor()  # obtain a cursor
        # sql1 = "CREATE DATABASE db_test DEFAULT CHARSET utf8 COLLATE utf8_general_ci"
        # cur.execute(sql1)  # create the utf8 database db_test (only needed once)
        # select the database
        conn.select_db('db_test')
        sql1 = "drop table if exists pageinfo"
        cur.execute(sql1)
        # create the pageinfo table
sql2 = "create table if not exists pageinfo(\
id int,\
url varchar(80),\
device_id varchar(30),\
title varchar(80),\
description varchar(100),\
status varchar(10),\
location varchar(200))"
cur.execute(sql2)
count = 0
rst_list = DataExtractor().get_all(url_list)
sql3 = "insert into pageinfo values(%s,%s,%s,%s,%s,%s,%s)"
for i in range(0,len(rst_list)):
param = (count,
url_list[i],
rst_list[i]['id'],
rst_list[i]['title'],
rst_list[i]['description'],
rst_list[i]['status'],
rst_list[i]['location'])
            cur.execute(sql3, param)  # param is a tuple; the driver binds the %s placeholders
count += 1
print "\n"*2
print '====='*20
print "获取所有结果:"
cur.execute('select * from pageinfo')
#重置游标位置,0为偏移量,mode=absolute | relative,默认为relative,
cur.scroll(0,mode='absolute')
results = cur.fetchall()
for r in results:
print '%s\t%s\t%s\t%s\t%s\t%s\t%s' % r
        conn.commit()  # commit the inserts before closing
        cur.close()
        conn.close()
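
# A minimal usage sketch (assumptions: a MySQL server reachable with the
# credentials in db_connect, and an existing db_test database; note that run()
# drops and recreates the pageinfo table; the feed id below is hypothetical):
#   DBAdapter().run(['https://xively.com/feeds/1234'])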
'''**********************************************************'''
'''
Main entry point
'''
def main():
    begin = time.time()  # program start time
    url_list = Crawler().broad_traverse(start_url, 5)  # URL list collected by the traversal
    dbAdapter = DBAdapter()  # instantiate DBAdapter
    dbAdapter.run(url_list)
    end = time.time()  # program end time
    print "Total elapsed time: ", (end - begin)
if __name__ == "__main__":
main()