Scraping data with Python and storing it in MySQL

The script below crawls Xively feed pages breadth-first, extracts device information (id, title, description, status, location) with regular expressions, and writes the results into a MySQL table.

#!/usr/bin/env python
#coding:utf-8
'''
Created on Jul 21, 2013

@author: belong
'''

import urllib
import re
import datetime
import MySQLdb
import time

start_url = "https://xively.com/search?q=temperature&category="

'''
Utility class
'''
class Tools:
    def write_log(self, level, info):
        print '[' + level + ']' + info
    
    # Regular-expression match; returns the concatenated matches as a string.
    # pattern is the regular expression; single is a boolean flag: when True only the first match is kept.
    def match_regex(self, pattern, src, single):
        result = ""
        com = re.compile(pattern, re.I|re.DOTALL)
        matchers = com.findall(src)
        for matcher in matchers:
            result += matcher + " "
            if single:  # stop after the first match if only one is needed
                break
        return result.strip()  # strip() removes surrounding whitespace
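    # Illustration (hypothetical input, shown as a comment only): with
    # pattern r'id:(\d*?),' and src 'id:123,id:456,', single=False
    # returns '123 456', while single=True returns '123'.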
    
    # Convert a byte string in the given charset to unicode; returns the decoded string.
    # unicode(s, charset) is equivalent to s.decode(charset).
    def to_unicode(self, s, charset):
        if charset != "":
            s = unicode(s, charset)
        return s
    
'''**********************************************************'''       
'''
URL crawling and page fetching class
'''          
class Crawler:
    # Extract the Xively feed URLs contained in the page at url
    def get_url(self, url):
        fp = urllib.urlopen(url)
#         pattern = re.compile("http://.*?\.com", re.I)
#         pattern = re.compile(r'[a-zA-z]+://[^\s]*\.html?', re.I)
        pattern = re.compile(r'https://xively.com/feeds/\d+', re.I)
        data = fp.read()
        fp.close()
        urls = pattern.findall(data) if data else []
        return urls
    
    # Fetch the raw HTML of a page
    def get_page(self, url):
        fp = urllib.urlopen(url)  # returns a file-like object
        page = fp.read()          # read the whole HTML page as a string
        fp.close()
        return page
    
    # Download the page at url and save it to filename
    def download_url(self, url, filename):
        Tools().write_log("Info", "start downloading")
        f = open(filename, 'w')      # open filename for writing
        f.write(self.get_page(url))  # write the page string to the file
        f.close()
        Tools().write_log("Info", "page downloaded successfully")
    
    # Breadth-first traversal starting from start_url, visiting at most `number` pages
    def broad_traverse(self, start_url, number):
        print "\n"*2
        print '====='*20
        Tools().write_log("Info", "start traversal")
        start_time = datetime.datetime.now()  # traversal start time
        visited = []     # URLs already visited
        unvisited = []   # URLs not yet visited (FIFO queue)
        unvisited.append(start_url)
        while len(unvisited):            # run while there are unvisited URLs
            if len(visited) < number:    # stop once the preset limit is reached
                url = unvisited.pop(0)   # pop(0) removes and returns the head of the queue
                print "---"*20
                print url, len(visited), len(unvisited)
                i = len(visited)
                self.download_url(url, str(i) + '.html')  # save the whole page to disk
                visited.append(url)                       # mark as visited
                url_list = self.get_url(url)              # extract all feed URLs on this page (may contain duplicates)
                for eachlink in url_list:
                    if eachlink not in unvisited and eachlink not in visited:
                        unvisited.append(eachlink)        # enqueue only unseen URLs
            else:
                break
        Tools().write_log("Info", "traversal finished")
        end_time = datetime.datetime.now()  # traversal end time
        interval = end_time - start_time
        Tools().write_log("Info", "traversal time: %s" % (interval))
        print "====="*20
        print "\n"*2
        return visited
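    # Illustration (hypothetical run): broad_traverse(start_url, 5) saves
    # 0.html .. 4.html in the working directory and returns the five visited
    # URLs, starting with start_url itself followed by feed URLs such as
    # 'https://xively.com/feeds/1234' (illustrative id).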

'''**********************************************************'''       
'''
Content extraction class
'''
class DataExtractor():
    
    # 1. device id
    def get_id(self, data):
        device_id = Tools().match_regex(r'id:(\d*?),', data, False)
        return device_id
        
    # 2. device title
    def get_title(self, data):
        title = Tools().match_regex(r'title:(.*?),p', data, False)
        return title
    
    # 3. device description
    def get_description(self, data):
        description = Tools().match_regex(r'description:(.*?),', data, False)
        return description
    
    # 4. device status
    def get_status(self, data):
        status = Tools().match_regex(r'status:(.*?),', data, True)
        return status
    
    # 5. device location
    def get_location(self, data):
        location = Tools().match_regex(r'location:{(.*?)}', data, True)
        return location
    
    # Get the charset declared in the page
    def get_charset(self, data):
        charset = Tools().match_regex(r'charset="(.*?)"', data, True)
        return charset
    
    # All information for one device
    def get_info(self, url):
        try:
            data = Crawler().get_page(url)
            charset = self.get_charset(data)
            data = Tools().to_unicode(data, charset)  # decode the page to unicode
            data = Tools().match_regex(r'JSON\.parse(.*?);', data, False)  # grab the JSON.parse(...) payload up to the closing ';'
            data = re.sub(r'\\\"', "", data)  # strip the escaped quotes
            print data
        except:
            Tools().write_log("Error", url + " : failed to fetch page")
            raise
        rst = {}  # result dict
        rst['id'] = self.get_id(data)
        rst['title'] = self.get_title(data)
        rst['description'] = self.get_description(data)
        rst['status'] = self.get_status(data)
        rst['location'] = self.get_location(data)
        return rst
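    # Illustration (hypothetical values): get_info returns a dict such as
    # {'id': '1234', 'title': 'office temperature', 'description': 'temperature sensor',
    #  'status': 'live', 'location': 'lat:51.5,lng:-0.1'}.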
    
    def get_all(self,url_list):
        rst_list = []
        for url in url_list:
            rst = self.get_info(url)
            rst_list.append(rst)
        return rst_list
    
'''**********************************************************'''               
'''
Store into the database
'''
class DBAdapter:

    def db_connect(self):
        conn = MySQLdb.connect(host = "localhost",
                               user = "root",
                               passwd = "123456",
                               charset = "utf8")
        return conn
    
    def run(self, url_list):
        conn = self.db_connect()  # connect to MySQL
        cur = conn.cursor()       # get a cursor
#         sql1 = "CREATE DATABASE db_test DEFAULT CHARSET utf8 COLLATE utf8_general_ci"
#         cur.execute(sql1)       # create the utf8 database db_test (run once)

        # select the database
        conn.select_db('db_test')
        
        sql1 = "drop table if exists pageinfo"
        cur.execute(sql1)
        
        # create the table pageinfo
        sql2 = "create table if not exists pageinfo(\
                    id int,\
                    url varchar(80),\
                    device_id varchar(30),\
                    title varchar(80),\
                    description varchar(100),\
                    status varchar(10),\
                    location varchar(200))"
        cur.execute(sql2)
        
        count = 0
        rst_list = DataExtractor().get_all(url_list)
        sql3 = "insert into pageinfo values(%s,%s,%s,%s,%s,%s,%s)"
        for i in range(0,len(rst_list)):
            param = (count,
                     url_list[i],
                     rst_list[i]['id'],
                     rst_list[i]['title'],
                     rst_list[i]['description'],
                     rst_list[i]['status'],
                     rst_list[i]['location'])
            cur.execute(sql3, param)  # param is a tuple; the driver escapes each value
            count += 1
        print "\n"*2
        print '====='*20
        print "All stored rows:"
        cur.execute('select * from pageinfo')
          
        # reset the cursor position; offset 0, mode is 'absolute' or 'relative' (default 'relative')
        cur.scroll(0, mode='absolute')
        results = cur.fetchall()
        for r in results:
            print '%s\t%s\t%s\t%s\t%s\t%s\t%s' % r
        
        conn.commit()
        cur.close()
        conn.close()
        
'''**********************************************************'''       
'''
Main function
'''         
def main():
    begin = time.time()  # program start time
    url_list = Crawler().broad_traverse(start_url, 5)  # get the list of visited URLs
    dbAdapter = DBAdapter()  # instantiate DBAdapter
    dbAdapter.run(url_list)
    end = time.time()  # program end time
    print "Total time elapsed: ", (end - begin)

if __name__ == "__main__":
    main()
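
To check what ended up in the database, the table can be queried from a separate script. Below is a minimal sketch, assuming the same credentials (root / 123456 on localhost) and the db_test / pageinfo names used above:

#!/usr/bin/env python
#coding:utf-8
# Minimal verification script: prints the rows stored by the crawler above.
import MySQLdb

conn = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                       db="db_test", charset="utf8")
cur = conn.cursor()
cur.execute("select id, url, device_id, title, status from pageinfo")
for row in cur.fetchall():
    print "\t".join(str(col) for col in row)
cur.close()
conn.close()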
