采集页面
# -*- coding: UTF-8 -*-
# Python 2 bare print statement: writes an empty line to standard output.
print
from conf import *
import urllib
import urllib2
import MySQLdb
import re
# Module-level database helper; the script section further down calls
# res.gettitle(...) on it to store the scraped links.
res = MysqldbHelper()
class NewsTitle:
    """Scrape headline links from the Baidu News front page.

    The page at ``self.url`` is downloaded, the markup between the
    ``pane-news`` div and the ``footerwrapper`` div is cut out, image-only
    anchors are removed, and the remaining ``http://`` links are returned as
    ``(url, inner_html)`` tuples.
    """

    # Captures everything from the news pane up to (not including) the footer.
    _PANE_PATTERN = re.compile(
        '(<div id="pane-news" .*?)<div id="footerwrapper">', re.S)
    # Anchors that only wrap an image, on one line ...
    _IMG_LINK_PATTERN = re.compile(r'<a .*?><img .*?</a>')
    # ... and the variant where the <img> sits on its own line.
    _IMG_LINK_MULTILINE_PATTERN = re.compile(r'<a .*?>\n<img .*?\n</a>')
    # Captures (absolute http URL, anchor inner HTML).
    _LINK_PATTERN = re.compile('<a href="(http://.*?)".*?>(.*?)</a>', re.S)

    def __init__(self):
        self.url = "http://news.baidu.com/"

    def getPage(self):
        """Download ``self.url`` and return the raw response body."""
        request = urllib2.Request(self.url)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the socket even if read() fails.
            response.close()

    def getTitle(self):
        """Return the news-pane markup with image-only anchors removed.

        Returns an empty string when the expected ``pane-news`` /
        ``footerwrapper`` markers are not found in the page (previously this
        crashed with AttributeError on the ``None`` match).
        """
        page = self.getPage()
        match = self._PANE_PATTERN.search(page)
        if match is None:
            return ''
        fragment = self._IMG_LINK_PATTERN.sub('', match.group(1))
        return self._IMG_LINK_MULTILINE_PATTERN.sub('', fragment)

    def getHref(self):
        """Return a list of ``(url, inner_html)`` tuples for the news links.

        The list is empty when the news pane could not be located.
        """
        return self._LINK_PATTERN.findall(self.getTitle())
# Script entry: scrape the headline links and hand them to the database helper.
news = NewsTitle()
# List of (url, inner_html) tuples produced by the link regex in getHref().
new = news.getHref()
# print new
# NOTE(review): this rebinds `res` to gettitle()'s return value. If the
# MysqldbHelper imported from `conf` is the class shown later in this file,
# gettitle() has no return statement, so `res` becomes None here — confirm
# whether the helper is needed afterwards.
res = res.gettitle(new)
# Store into the database (legacy inline version, kept commented out)
#db = MySQLdb.connect("localhost","root","root","month11",charset="GBK")
#cursor = db.cursor()
# for item in new:
# print item[0], news.tranTags(item[1])
# urll=item[0]
# vals=news.tranTags(item[1])
# sql = """INSERT INTO title(title,url)VALUES (%s, %s)""" %("'"+vals+"'","'"+urll+"'")
# try:
# cursor.execute(sql)
# db.commit()
# except:
# # Rollback in case there is any error
# db.rollback()
#
类库
#!D:/Python/python.exe
# -*- coding: UTF-8 -*-
#print "Content-type:textml"
import MySQLdb
import re
import ConfigParser
# Module-level parser shared with MysqldbHelper, whose constructor loads
# 'db.conf' into it and reads the [database] options.
config = ConfigParser.ConfigParser()
# print dbhost,dbport,dbname,dbuser,dbpassword,dbcharset
#
class MysqldbHelper:
    """Thin helper around a single MySQLdb connection configured by db.conf.

    Connection settings are read from the ``[database]`` section of
    ``db.conf`` (dbhost, dbport, dbname, dbuser, dbpassword, dbcharset).

    Attributes:
        conn: the open MySQLdb connection (kept so changes can be committed).
        cursor: a cursor on ``conn`` used by every method.

    Table names are spliced into the SQL text because identifiers cannot be
    bound as parameters; pass only trusted table names. All *values* are bound
    through the driver's parameter substitution.
    """

    # Strips single-line <div ...>...</div> fragments ('.' does not match
    # newlines here, same as the original pattern).
    _DIV_PATTERN = re.compile('<div.*?</div>')

    def __init__(self):
        config.read('db.conf')
        dbhost = config.get("database", "dbhost")
        dbport = config.get("database", "dbport")
        dbname = config.get("database", "dbname")
        dbuser = config.get("database", "dbuser")
        dbpassword = config.get("database", "dbpassword")
        dbcharset = config.get("database", "dbcharset")
        # Honour the configured port and charset (previously read but ignored).
        # use_unicode=False keeps text columns coming back as byte strings,
        # which is what callers got before a charset was passed.
        self.conn = MySQLdb.connect(
            host=dbhost,
            port=int(dbport),
            user=dbuser,
            passwd=dbpassword,
            db=dbname,
            charset=dbcharset,
            use_unicode=False,
        )
        self.cursor = self.conn.cursor()

    # Delete a single row
    def getdel(self, table, id):
        """Delete the row of ``table`` whose ``id`` column equals ``id``.

        Returns the string ``'true'`` on success. On a MySQLdb error the
        transaction is rolled back, the error is printed and ``None`` is
        returned.
        """
        try:
            self.cursor.execute(
                "DELETE FROM " + table + " WHERE id=%s", (id,))
            self.conn.commit()
            return 'true'
        except MySQLdb.Error as e:
            self.conn.rollback()
            print("getdel Error %d: %s" % (e.args[0], e.args[1]))

    def getguo(self, x):
        """Return ``x`` with every ``<div ...>...</div>`` fragment removed."""
        return self._DIV_PATTERN.sub('', x)

    # Select every row of a table
    def getselect(self, table):
        """Return all rows of ``table`` as returned by ``cursor.fetchall()``.

        On a MySQLdb error the error is printed and ``None`` is returned.
        """
        try:
            self.cursor.execute("SELECT * FROM " + table)
            return self.cursor.fetchall()
        except MySQLdb.Error as e:
            print("getselect Error %d: %s" % (e.args[0], e.args[1]))

    # Delete several rows at once
    def getdell(self, table, id):
        """Delete every row of ``table`` whose ``id`` is in ``id``.

        ``id`` is a comma-separated string of id values (for example
        ``"1,2,3"``); a list or tuple of ids is accepted as well. Returns
        ``'true'`` on success. When no id is given, or on a MySQLdb error, a
        message is printed and ``None`` is returned.
        """
        if isinstance(id, (list, tuple)):
            ids = list(id)
        else:
            ids = [part.strip() for part in id.split(',') if part.strip()]
        if not ids:
            # "id in ()" is not valid SQL; report it instead of sending it.
            print("getdell Error: no ids given")
            return None
        placeholders = ", ".join(["%s"] * len(ids))
        sql = "DELETE FROM " + table + " WHERE id in (" + placeholders + ")"
        try:
            self.cursor.execute(sql, ids)
            self.conn.commit()
            return 'true'
        except MySQLdb.Error as e:
            self.conn.rollback()
            print("getdell Error %d: %s" % (e.args[0], e.args[1]))

    # Insert titles and links
    def gettitle(self, new):
        """Insert scraped ``(url, title_html)`` pairs into the ``title`` table.

        ``<div>`` fragments are stripped from each title. Byte-string titles
        are treated as GB2312 and re-encoded to UTF-8 (undecodable bytes are
        dropped) before being stored. All rows are committed together; if any
        insert fails the batch is rolled back and the MySQLdb error is
        re-raised.
        """
        sql = "INSERT INTO title(title,url) VALUES (%s, %s)"
        try:
            for item in new:
                cleaned = self.getguo(item[1])
                print("%s %s" % (item[0], cleaned))
                if isinstance(cleaned, bytes):
                    cleaned = cleaned.decode('GB2312', 'ignore').encode('utf8')
                # Values are bound by the driver, so quotes inside a scraped
                # title can neither break nor alter the statement.
                self.cursor.execute(sql, (cleaned, item[0]))
            self.conn.commit()
        except MySQLdb.Error:
            self.conn.rollback()
            raise
数据库
#配置数据库
[database]
dbhost=localhost
dbport=3306
dbname=month11
dbuser=root
dbpassword=root
dbcharset=utf8
采集页面
# -*- coding: UTF-8 -*-
import urllib
import urllib2
import re
import MySQLdb
# Python 2 bare print statement: writes an empty line to standard output.
print
class News:
    """Scrape the navigation-menu links from the Baidu News front page.

    The page at ``self.url`` is downloaded, the markup between the ``menu``
    div and the ``slogan`` element is cut out, and each ``http://`` link in it
    is returned as ``(url_up_to_first_slash_after_host, inner_html)``.
    """

    # Strips single-line <div ...>...</div> fragments.
    _DIV_PATTERN = re.compile('<div.*?</div>')
    # Captures everything from the menu div up to the slogan element.
    _MENU_PATTERN = re.compile(
        '(<div id="menu".*?)<i class="slogan"></i>', re.S)
    # Captures (URL truncated after the first '/' following the host,
    # anchor inner HTML).
    _NAV_LINK_PATTERN = re.compile(
        '<a href="(http://.*?/).*?>(.*?)</a>', re.S)

    def __init__(self):
        self.url = "http://news.baidu.com/"

    def tranTags(self, x):
        """Return ``x`` with every ``<div ...>...</div>`` fragment removed."""
        return self._DIV_PATTERN.sub('', x)

    def getPage(self):
        """Download ``self.url`` and return the raw response body."""
        request = urllib2.Request(self.url)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the socket even if read() fails.
            response.close()

    def getNavCode(self):
        """Return the markup of the navigation menu.

        Returns an empty string when the expected ``menu`` / ``slogan``
        markers are not found (previously this crashed with AttributeError on
        the ``None`` match).
        """
        match = self._MENU_PATTERN.search(self.getPage())
        if match is None:
            return ''
        return match.group(1)

    def getNav(self):
        """Return a list of ``(url, inner_html)`` tuples for the menu links.

        The list is empty when the menu could not be located.
        """
        return self._NAV_LINK_PATTERN.findall(self.getNavCode())
# Store the scraped navigation entries in the `aaa` table.
db = MySQLdb.connect("localhost","root","root","month11",charset="GBK")
cursor = db.cursor()
news = News()
new = news.getNav()
# The values are bound by the driver instead of being formatted into the SQL
# text, so quotes in scraped names can neither break nor alter the statement.
sql = "INSERT INTO aaa(name,url) VALUES (%s, %s)"
for i in new:
    vals = news.tranTags(i[1])
    print("%s %s" % (i[0], vals))
    try:
        cursor.execute(sql, (vals, i[0]))
        db.commit()
    except MySQLdb.Error as e:
        # Roll back the failed row, report it, and continue with the rest.
        db.rollback()
        print("insert Error: %s" % (e,))