#百度百家文章收集
import re
import urllib.request

import pymysql.cursors

# Database connection parameters
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file instead.
config = {
    'host': 'localhost',
    'port': '3310',  # kept as text; converted with int() when connecting
    'username': 'woider',
    'password': '3243',
    'database': 'python',
    'charset': 'utf8',
}

# Table creation statement. This bare string is reference documentation only:
# nothing in this script executes it.
'''CREATE TABLE `news` (
`id` int(6) unsigned AUTO_INCREMENT NOT NULL,
`title` varchar(45) NOT NULL,
`author` varchar(12) NOT NULL,
`date` varchar(12) NOT NULL,
`about` varchar(255) NOT NULL,
`content` text NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;'''
# Article object
class Article(object):
    """Plain record holding the fields collected for one article.

    Every field defaults to ``None`` at class level; callers fill an
    instance by assigning to its attributes.
    """

    title = None
    author = None
    date = None
    about = None
    content = None
# Regular expressions
#
# NOTE(review): the HTML markup inside these pattern literals was stripped
# from the source as received -- only the capture groups and some comments
# survived.  The tag text below is a best-effort reconstruction and MUST be
# confirmed against the live pages before it is relied on.  What the rest of
# the file does establish:
#   * patArticle yields one URL string per match (the matches are iterated
#     and each is passed to collect_article()).
#   * patTitle / patAuthor / patDate / patAbout each capture one string.
#   * patContent has two groups (collect_article indexes [0] into the
#     findall() result to get the outer one).
#   * patCopy and patTag are only used with re.sub(), to delete text.
patArticle = r'<p\s*class="feeds-item-title"><a\s*href="(.+?)"'  # article links on the home page
patTitle = r'<div\s*id="page">\s*<h1>(.+)</h1>'  # match the article title
patAuthor = r'<div\s*class="article-info">\s*<a.+>(.+)</a>'  # match the article author
patDate = r'<span\s*class="time">(.+)</span>'  # match the publication date
patAbout = r'<blockquote><i\s*class="i\siquote"></i>(.+)</blockquote>'  # match the article summary
patContent = r'<div\s*class="article-detail">((.|\s)+)'  # article body through end of page
patCopy = r'<div\s*class="copyright">(.|\s)+'  # trailing section removed from the body
patTag = r'(<script((.|\s)*?)</script>)|(<.*?>\s*)'  # script blocks and any remaining tags
# Article information
def _first_group(pattern, text):
    """Return capture group 1 of the first match of *pattern* in *text*.

    Raises IndexError when nothing matches, which is the exception the
    previous ``re.findall(...)[0]`` lookups raised in that situation.
    """
    match = re.search(pattern, text)
    if match is None:
        raise IndexError("pattern not found: %r" % (pattern,))
    return match.group(1)


def collect_article(url):
    """Download the page at *url* and extract its fields into an Article.

    Args:
        url: address of the article page; the body is decoded as UTF-8.

    Returns:
        An ``Article`` with ``title``, ``author``, ``date``, ``about`` and
        ``content`` filled in from the module-level patterns.

    Raises:
        IndexError: if one of the patterns does not match the page.
    """
    # Use the response as a context manager so the connection is released
    # once the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf8')

    article = Article()
    article.title = _first_group(patTitle, html)
    article.author = _first_group(patAuthor, html)
    article.date = _first_group(patDate, html)
    article.about = _first_group(patAbout, html)

    # Group 1 is the outer capture of patContent, i.e. the value the original
    # obtained with findall(...)[0][0]; taking it explicitly does not depend
    # on how many groups the pattern declares.
    content = _first_group(patContent, html)
    content = re.sub(patCopy, '', content)
    # NOTE(review): the literal replaced here was stripped from the source
    # (only the quotes survived); '</p>' is assumed -- confirm.
    content = re.sub('</p>', '\n', content)
    content = re.sub(patTag, '', content)
    article.content = content
    return article


# Store the article
def save_article(connect, article):
    """Insert *article* into the ``news`` table and report the outcome.

    Args:
        connect: open database connection providing ``cursor()`` and
            ``commit()``.
        article: object exposing ``title``, ``author``, ``date``, ``about``
            and ``content``.

    Returns:
        The article title when the row was committed, otherwise the text of
        the exception that was raised.
    """
    message = None
    # Initialised up front so the finally clause is safe even when
    # connect.cursor() itself raises (previously an UnboundLocalError there
    # replaced the real error).
    cursor = None
    try:
        cursor = connect.cursor()
        sql = "INSERT INTO news (title, author, date, about, content) VALUES ( %s, %s, %s, %s, %s)"
        data = (article.title, article.author, article.date, article.about, article.content)
        cursor.execute(sql, data)  # values are passed as parameters, not formatted into the SQL
        connect.commit()
    except Exception as e:
        message = str(e)
    else:
        message = article.title
    finally:
        if cursor is not None:
            cursor.close()
    return message


# Fetch links
home = 'http://baijia.baidu.com/'  # Baidu Baijia home page
# Read the page source, releasing the HTTP response as soon as it is read.
with urllib.request.urlopen(home) as response:
    html = response.read().decode('utf8')
links = re.findall(patArticle, html)[0:10]  # daily hot news: first ten matches

# Connect to the database
connect = pymysql.connect(
    host=config['host'],
    port=int(config['port']),
    user=config['username'],
    passwd=config['password'],
    db=config['database'],
    charset=config['charset']
)
try:
    for url in links:
        article = collect_article(url)  # collect the article's fields
        message = save_article(connect, article)  # store them
        print(message)
finally:
    # Close the connection even if collecting an article raises.
    connect.close()