python爬取百度新闻_python采集百度新闻源并自动发布文章到phpcms

通过对phpcms数据库字段的填充,实现自动发布文章。手动发布一篇文章并查看数据库中哪些table发生变化,即可发现cms(如帝国cms等)文章自动化发布工具开发的突破口!

# coding=utf-8

'''功能:采集百度新闻(https://news.baidu.com/)内容,百度新闻聚合了许多行业网站的新闻,已经帮我们去重筛选了,采集自己行业的新闻数据很不错。

主要思路:1,利用字典把各个网站的网址与正则及网页编码对应起来 2,把采集过的url放到一个文件中,判断是否采集过 3,百度新闻5分钟更新一次,可以再建个程序每隔几分钟运行一次

'''

import pycurl,StringIO,json,urllib,urllib2,re

import MySQLdb

import time

from warnings import filterwarnings

import MySQLdb as Database

# Silence every warning whose category is MySQLdb's Warning class.
filterwarnings('ignore', category = Database.Warning)

import sys

# Python 2 only: `reload` is a builtin there. Reloading `sys` is done so that
# `sys.setdefaultencoding` can be called on the next statement.
# NOTE(review): this idiom does not exist on Python 3 - confirm the target
# interpreter before porting.
reload(sys)

# Make 'utf8' the interpreter-wide default string encoding.
sys.setdefaultencoding('utf8')

# HTTP request header lines handed to pycurl by curl(): a desktop Chrome
# User-Agent and a fixed Cookie string.  Each entry is one "Name: value" line;
# the long values are split with implicit string concatenation for readability.
headers = [
    (
        "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/49.0.2623.112 Safari/537.36"
    ),
    (
        "Cookie: spversion=20130314; "
        "historystock=603158%7C*%7C1A0001%7C*%7C000967%7C*%7C603328; "
        "Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1="
        "1467682875,1467682943,1467682974,1468293176; "
        "Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1468293226"
    ),
]

def curl(url):
    """Fetch ``url`` with pycurl and return the raw response body.

    The request follows redirects (at most 5), uses a 60 second connect
    timeout and a 120 second overall timeout, asks for gzip/deflate
    encoding, and sends the module-level ``headers`` list as request headers.

    Args:
        url: Address to request.

    Returns:
        The response body exactly as it was written into the buffer; no
        charset decoding is performed here.

    Raises:
        pycurl.error: propagated from ``perform()`` when the transfer fails.
    """
    body = StringIO.StringIO()  # in-memory sink for the response body
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.FOLLOWLOCATION, True)   # follow redirects automatically
        c.setopt(pycurl.MAXREDIRS, 5)           # ...but at most five of them
        c.setopt(pycurl.CONNECTTIMEOUT, 60)     # connect timeout (seconds)
        c.setopt(pycurl.TIMEOUT, 120)           # whole-transfer timeout (seconds)
        c.setopt(pycurl.ENCODING, 'gzip,deflate')
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.HTTPHEADER, headers)    # request headers defined at module level
        # Optional knobs left disabled by the original author:
        # pycurl.REFERER, pycurl.PROXY, pycurl.POST / pycurl.POSTFIELDS.
        c.setopt(pycurl.WRITEFUNCTION, body.write)  # stream the body into the buffer
        c.perform()
    finally:
        # Always release the handle, even when perform() raises; the original
        # never closed it.
        c.close()
    return body.getvalue()

# 通过正则提取元素

def search(req, html):
    """Return the first capture group of the first match of ``req`` in ``html``.

    Args:
        req: Regular expression containing at least one capturing group.
        html: Text to search.

    Returns:
        ``group(1)`` of the first match, or the sentinel string ``'no'`` when
        the pattern does not match at all.  A match whose first group is empty
        returns ``''``, not ``'no'``.
    """
    match = re.search(req, html)
    if match is None:
        # Callers rely on this sentinel to detect "nothing extracted".
        return 'no'
    return match.group(1)

# 去除文章url、多余标签等、补全路径等

def content_sort(content):
    """Strip links, images and layout/styling tags from an article's HTML.

    The substitutions are applied in a fixed order; paragraph tags are
    normalised rather than removed, and the inner text of stripped tags is
    kept.

    NOTE(review): the published copy of this function had the HTML-like parts
    of its regex literals removed by the page extractor (for example
    ``'?span.*?>'``, which is not even a valid pattern).  Patterns whose
    surviving fragment identifies them were restored by re-adding the leading
    ``</``; patterns that were lost entirely are reconstructed below and
    individually marked - confirm them against the author's original script.

    Args:
        content: HTML fragment of the article body.

    Returns:
        The cleaned fragment.
    """
    # (pattern, replacement, flags), applied top to bottom.
    rules = (
        # NOTE(review): both literals of the next two rules were lost;
        # presumably they normalise <p ...> / </P> to bare <p> / </p>.
        (r'<p.*?>', '<p>', re.I),
        (r'</p>', '</p>', re.I),
        (r'</?span.*?>', '', 0),
        (r'</?a.*?>', '', 0),
        # The original passed re.IGNORECASE positionally, where re.sub expects
        # `count`, so only two <img> tags were removed and matching stayed
        # case-sensitive.  One case-insensitive rule replaces the img/IMG pair.
        (r'</?img.*?>', '', re.I),
        # Likewise one case-insensitive rule replaces the div/DIV pair.
        (r'</?div.*?>', '', re.I),
        (r'</?iframe.*?>', '', 0),
        (r'</?center.*?>', '', 0),
        (r'</?[fF].*?>', '', 0),
        # NOTE(review): only '[\s\S]*?' survived of this rule; presumably it
        # removed a paired block such as <script>...</script> - confirm.
        (r'<script.*?>[\s\S]*?</script>', '', 0),
        (r'</?strong.*?>', '', 0),
        # NOTE(review): same situation; presumably <style>...</style>.
        (r'<style.*?>[\s\S]*?</style>', '', 0),
        # NOTE(review): the last three rules all survive as a single space;
        # they may originally have been entities such as &nbsp; - confirm.
        (r'&nbsp;', '', 0),
        (r' ', '', 0),
    )
    # Two further rules survive only as re.sub('', '', content); an empty
    # pattern with an empty replacement changes nothing, so they are omitted.
    for pattern, replacement, flags in rules:
        content = re.sub(pattern, replacement, content, flags=flags)
    return content

#域名与正则、编码对应表

req_dict = {

'finance.sina.com.cn':

{'title':'

(.*?)','content':'([\s\S]*?)','decode':'utf-8'},

'stock.eastmoney.com':

{'title':'

(.*?)','content':'
([\s\S]*?)
','decode':'gbk'},

'finance.eastmoney.com':

{'title':'

(.*?)','content':'
([\s\S]*?)
','decode':'gbk'},#ok

'guba.eastmoney.com':

{'title':'

(.*?)_.*?','content':'
([\s\S]*?)
','decode':'utf-8'},#ok

'stock.jrj.com.cn':

{'title':'

(.*?)-','content':'
([\s\S]*?)
','decode':'gbk'},

'hk.jrj.com.cn':

{'title':'

(.*?)-','content':'
([\s\S]*?)
','decode':'gbk'},

'hkstock.cnfol.com':

{'title':'

(.*?)_.*?','content':'
([\s\S]*?)','decode':'utf-8'},#ok

'sc.stock.cnfol.com':

{'title':'

(.*?)_.*?','content':'
([\s\S]*?)','decode':'utf-8'},#ok

'money.163.com':

{'title':'

(.*?)_.*?','content':'
([\s\S]*?)','decode':'utf-8'},

'www.chinastock.com.cn':

{'title':'

([\s\S]*?)
','content':'
([\s\S]*?)
','decode':'utf-8'},

'stock.huagu.com':

{'title':'

([\s\S]*?)

','content':'
([\s\S]*?)
','decode':'utf-8'},

'stock.sohu.com':

{'title':'

([\s\S]*?)

','content':'
([\s\S]*?)

'stock.cngold.org':

{'title':'

(.*?)-.*?','content':'
([\s\S]*?)
','decode':'utf-8'},

'hk.stock.hexun.com':

{'title':'

(.*?)[-_|].*?','content':'
([\s\S]*?)
','decode':'utf-8'},

'stock.gucheng.com':

{'title':'

(.*?)[-_|].*?','content':'
([\s\S]*?)
','decode':'utf-8'},

'www.cnstock.com':

{'title':'

(.*?)-.*?','content':'
([\s\S]*?)
','decode':'gbk'},

'www.ccstock.cn':

{'title':'

(.*?)-.*?','content':'
([\s\S]*?)
','decode':'utf-8'},

'news.emoney.cn':

{'title':'

(.*?)-.*?','content':'
([\s\S]*?)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值