python 采集调类入库

采集页面
# -*- coding: UTF-8 -*-
# Python 2 bare print statement: writes one empty line to stdout.
# NOTE(review): possibly a leftover CGI header separator -- confirm before removing.
print
# Star-import from the local conf module; MysqldbHelper used below is
# presumably provided by it (see the class library section) -- TODO confirm.
from conf import *
import urllib
import urllib2
import MySQLdb
import re
# Module-level database helper; the driver code at the bottom of this script
# calls res.gettitle() on it.
res = MysqldbHelper()


class NewsTitle:
    """Scrape headline links from the Baidu News front page.

    Typical use is ``NewsTitle().getHref()``, which downloads the page,
    isolates the news pane, drops image-only links and returns the remaining
    ``(url, inner_html)`` pairs.
    """

    # Everything from the news pane up to (not including) the footer wrapper.
    _NEWS_PANE = re.compile(
        '(<div id="pane-news" .*?)<div id="footerwrapper">', re.S)
    # An anchor whose content starts with an <img>.  ``[^>]*`` keeps the match
    # inside ONE opening tag; the previous ``<a .*?><img`` form could start at
    # an earlier text link and swallow it together with the image link.
    # ``\s*`` plus re.S covers both the single-line and the newline-separated
    # layouts that used to need two separate substitutions.
    _IMAGE_LINK = re.compile(r'<a [^>]*>\s*<img .*?</a>', re.S)
    # Absolute http links: group 1 is the URL, group 2 the anchor's inner HTML.
    _TEXT_LINK = re.compile('<a href="(http://.*?)".*?>(.*?)</a>', re.S)

    #init
    def __init__(self):
        self.url = "http://news.baidu.com/"

    # Disabled tag-stripping helpers, kept as in the original for reference.
    # convert div to ''
    #    def tranTags(self, x):
    #       pattern = re.compile('<div.*?</div>')
    #       pattern = re.compile('<img.*?>')
    #       res = re.sub(pattern, '', x)
    #       return res
    # remove span tags
    #    def clerSpan(self, x):
    #        pattern = re.compile('<span.*?</span>')
    #        res = re.sub(pattern, '', x)
    #        return res
    # remove JavaScript
    #    def clerjav(self, x):
    #        pattern = re.compile('javascript:.*?;')
    #        res = re.sub(pattern, '', x)
    #        return res

    def getPage(self):
        """Download ``self.url`` and return the raw response body."""
        request = urllib2.Request(self.url)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Close explicitly so the socket is released even if read() fails.
            response.close()

    def getTitle(self):
        """Return the news-pane HTML with image links removed.

        Returns an empty string when the expected pane/footer markers are not
        present in the page, instead of failing with AttributeError on a
        ``None`` match.
        """
        page = self.getPage()
        found = self._NEWS_PANE.search(page)
        if found is None:
            return ''
        return self._IMAGE_LINK.sub('', found.group(1))

    def getHref(self):
        """Return a list of ``(url, inner_html)`` tuples for the text links
        found in the news pane (empty list when nothing matches)."""
        hrefcode = self.getTitle()
        return self._TEXT_LINK.findall(hrefcode)
        
# Driver: scrape the headline links, then pass them to the helper's gettitle().
news = NewsTitle()
new = news.getHref()
# print new
# NOTE(review): this rebinds `res` from the helper object to gettitle()'s
# return value; the gettitle shown in the class library section has no return
# statement, so `res` presumably ends up None -- confirm that is intended.
res = res.gettitle(new)

# Store in the database (older inline version, left disabled below)
#db = MySQLdb.connect("localhost","root","root","month11",charset="GBK")
#cursor = db.cursor()
# for item in new:
#     print item[0], news.tranTags(item[1])
#     urll=item[0]
#     vals=news.tranTags(item[1])
#     sql = """INSERT INTO title(title,url)VALUES (%s, %s)""" %("'"+vals+"'","'"+urll+"'")
#     try:       
#         cursor.execute(sql)   
#         db.commit()
#     except:
#         # Rollback in case there is any error
#         db.rollback()
#                


类库

#!D:/Python/python.exe
# -*- coding: UTF-8 -*-
#print "Content-type:textml"

import MySQLdb
import re
import ConfigParser
# Shared parser instance; MysqldbHelper.__init__ below loads 'db.conf' into it.
config = ConfigParser.ConfigParser()
# print dbhost,dbport,dbname,dbuser,dbpassword,dbcharset
#

class MysqldbHelper:
    """Small helper around a single MySQLdb connection configured by db.conf.

    Values are always sent to the driver as bound parameters.  Table names
    cannot be bound, so the ``table`` arguments are still spliced into the SQL
    text and must come from trusted code, never from user input.
    """

    # Matches a <div>...</div> block on a single line ('.' does not cross
    # newlines because re.S is not set).
    _DIV_BLOCK = re.compile('<div.*?</div>')

    def __init__(self):
        """Read the [database] section of db.conf and open the connection."""
        config.read('db.conf')
        dbhost = config.get("database", "dbhost")
        dbport = config.getint("database", "dbport")
        dbname = config.get("database", "dbname")
        dbuser = config.get("database", "dbuser")
        dbpassword = config.get("database", "dbpassword")
        dbcharset = config.get("database", "dbcharset")
        # Port and charset were read but ignored before; pass them through.
        # The connection is kept on self so the write methods can commit.
        self.conn = MySQLdb.connect(dbhost, dbuser, dbpassword, dbname,
                                    port=dbport, charset=dbcharset)
        self.cursor = self.conn.cursor()

    @staticmethod
    def _split_ids(raw):
        """Turn "1,2,3" (or a list/tuple/set of ids) into a list of id strings."""
        if isinstance(raw, (list, tuple, set)):
            parts = raw
        else:
            parts = str(raw).split(',')
        cleaned = [str(part).strip() for part in parts]
        return [part for part in cleaned if part]

    # Delete a single row
    def getdel(self, table, id):
        """Delete the row of ``table`` whose id equals ``id``.

        Returns the string 'true' on success; on a MySQLdb error the error is
        printed and None is returned.
        """
        cursor = self.cursor
        try:
            sql = "DELETE FROM " + table + " WHERE id=%s"
            cursor.execute(sql, (id,))
            self.conn.commit()
            return 'true'
        except MySQLdb.Error as e:
            print("getdel Error %d: %s" % (e.args[0], e.args[1]))

    def getguo(self, x):
        """Return ``x`` with every single-line <div>...</div> block removed."""
        return self._DIV_BLOCK.sub('', x)

    # Fetch every row of a table
    def getselect(self, table):
        """Return all rows of ``table`` (the result of fetchall()).

        On a MySQLdb error the error is printed and None is returned.
        """
        cursor = self.cursor
        try:
            sql = "SELECT * FROM " + table
            cursor.execute(sql)
            return cursor.fetchall()
        except MySQLdb.Error as e:
            print("getselect Error %d: %s" % (e.args[0], e.args[1]))

    # Delete several rows at once
    def getdell(self, table, id):
        """Delete the rows of ``table`` whose id is listed in ``id``.

        ``id`` is a comma-separated string such as "1,2,3"; a list/tuple/set
        of ids is accepted as well.  Returns 'true' on success (including
        when no ids are given, in which case no query is sent); on a MySQLdb
        error the error is printed and None is returned.
        """
        cursor = self.cursor
        ids = self._split_ids(id)
        if not ids:
            # "IN ()" is a syntax error in MySQL and there is nothing to delete.
            return 'true'
        try:
            placeholders = ",".join(["%s"] * len(ids))
            sql = "DELETE FROM " + table + " WHERE id in (" + placeholders + ")"
            cursor.execute(sql, tuple(ids))
            self.conn.commit()
            return 'true'
        except MySQLdb.Error as e:
            print("getdell Error %d: %s" % (e.args[0], e.args[1]))

    # Insert titles and links
    def gettitle(self, new):
        """Insert scraped ``(url, title_html)`` pairs into the ``title`` table.

        <div> blocks are stripped from each title first.  Byte-string titles
        are transcoded from GB2312 to UTF-8 as before; text that is already
        unicode is passed to the driver unchanged.  All rows are committed
        together after the loop; database errors propagate to the caller.
        """
        cursor = self.cursor
        sql = "INSERT INTO title(title,url) VALUES (%s, %s)"
        for item in new:
            url = item[0]
            # Use this instance; the old code opened a second DB connection
            # only to reach getguo().
            title = self.getguo(item[1])
            print("%s %s" % (url, title))
            if isinstance(title, bytes):
                title = title.decode('GB2312', 'ignore').encode('utf8')
            # Bound parameters: quotes inside a scraped title can no longer
            # break (or inject into) the statement.
            cursor.execute(sql, (title, url))
        self.conn.commit()
         


数据库

# Database configuration (db.conf; the [database] section is read by MysqldbHelper)
[database]
dbhost=localhost
dbport=3306
dbname=month11
dbuser=root
dbpassword=root
dbcharset=utf8

采集页面

# -*- coding: UTF-8 -*-
import urllib
import urllib2
import re
import MySQLdb
# Python 2 bare print statement: writes one empty line to stdout.
# NOTE(review): possibly a leftover CGI header separator -- confirm before removing.
print

class News:
    """Scrape the navigation-menu links from the Baidu News front page."""

    # A <div>...</div> block on a single line ('.' does not cross newlines).
    _DIV_BLOCK = re.compile('<div.*?</div>')
    # Everything from the menu container up to the slogan marker.
    _MENU_BLOCK = re.compile('(<div id="menu".*?)<i class="slogan"></i>', re.S)
    # Group 1: the URL up to the first '/' after the host; group 2: the
    # anchor's inner HTML.
    _NAV_LINK = re.compile('<a href="(http://.*?/).*?>(.*?)</a>', re.S)

    #init
    def __init__(self):
        self.url = "http://news.baidu.com/"

    #convert div to ''
    def tranTags(self, x):
        """Return ``x`` with every single-line <div>...</div> block removed."""
        return self._DIV_BLOCK.sub('', x)

    #getPage
    def getPage(self):
        """Download ``self.url`` and return the raw response body."""
        request = urllib2.Request(self.url)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Close explicitly so the socket is released even if read() fails.
            response.close()

    #get navCode
    def getNavCode(self):
        """Return the HTML of the navigation menu.

        Returns an empty string when the menu/slogan markers are not found,
        instead of failing with AttributeError on a ``None`` match.
        """
        page = self.getPage()
        found = self._MENU_BLOCK.search(page)
        if found is None:
            return ''
        return found.group(1)

    #get nav
    def getNav(self):
        """Return a list of ``(url, inner_html)`` tuples for the menu links
        (empty list when the menu is missing or has no matching links)."""
        navCode = self.getNavCode()
        return self._NAV_LINK.findall(navCode)
        # for item in itmes:
        #     print item[0], self.tranTags(item[1])
# Store the scraped navigation entries in the database.
db = MySQLdb.connect("localhost","root","root","month11",charset="GBK")
cursor = db.cursor()
news = News()
new = news.getNav()
# Placeholders make the driver quote the values, so a scraped name or URL
# containing a quote can no longer break (or inject into) the statement.
sql = "INSERT INTO aaa(name,url) VALUES (%s, %s)"
for i in new:
    vals = news.tranTags(i[1])
    print("%s %s" % (i[0], vals))
    try:
        cursor.execute(sql, (vals, i[0]))
        db.commit()
    except MySQLdb.Error as e:
        # Roll back the failed row and report it; the previous bare
        # ``except:`` hid every failure, including typos and Ctrl-C.
        db.rollback()
        print("insert aaa Error: %s" % (e,))
# Release the cursor and connection once all rows have been processed.
cursor.close()
db.close()
               









评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值