# 通过对phpcms数据库字段的填充,实现自动发布文章。手动发布一篇文章并查看数据库中哪些table发生变化,即可发现cms(如帝国cms等)文章自动化发布工具开发的突破口!
# coding=utf-8
'''功能:采集百度新闻(https://news.baidu.com/)内容,百度新闻聚合了许多行业网站的新闻,已经帮我们去重筛选了,采集自己行业的新闻数据很不错。
主要思路:1,利用字典把各个网站的网址与正则及网页编码对应起来 2,把采集过得url放到一个文件中,判断是否采集过 3,百度新闻5分钟跟新一次,可以再建个程序每隔几分钟运行一次
'''
# --- module setup (Python 2 only: StringIO, urllib2, reload/setdefaultencoding) ---
import pycurl,StringIO,json,urllib,urllib2,re
import MySQLdb
import time
from warnings import filterwarnings
import MySQLdb as Database
# suppress MySQL warnings (e.g. duplicate-entry notices) raised through MySQLdb
filterwarnings('ignore', category = Database.Warning)
import sys
# Python 2 hack: restore sys.setdefaultencoding (hidden by site.py) so implicit
# str<->unicode conversions use utf8 instead of ascii
reload(sys)
sys.setdefaultencoding('utf8')
# HTTP headers sent with every pycurl request: a desktop Chrome User-Agent plus
# a hard-coded stock-site session cookie (NOTE(review): the cookie is dated and
# has likely expired -- confirm whether the target sites still need it).
headers = [
"User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
"Cookie: spversion=20130314; historystock=603158%7C*%7C1A0001%7C*%7C000967%7C*%7C603328; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1467682875,1467682943,1467682974,1468293176; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1468293226",
]
def curl(url):
    """Fetch *url* with pycurl and return the raw response body as a str.

    Uses the module-level ``headers`` list; follows up to 5 redirects,
    accepts gzip/deflate, 60s connect timeout, 120s total timeout.
    """
    c = pycurl.Curl()  # build a curl handle
    #c.setopt(pycurl.REFERER, 'https://qy.m.58.com/')  # set referer
    c.setopt(pycurl.FOLLOWLOCATION, True)  # follow redirects automatically
    c.setopt(pycurl.MAXREDIRS, 5)  # at most 5 redirects
    c.setopt(pycurl.CONNECTTIMEOUT, 60)  # connect timeout (seconds)
    c.setopt(pycurl.TIMEOUT, 120)  # total transfer timeout (seconds)
    c.setopt(pycurl.ENCODING, 'gzip,deflate')
    # c.setopt(c.PROXY, ip)  # proxy
    c.fp = StringIO.StringIO()  # in-memory buffer for the response body
    c.setopt(pycurl.URL, url)  # target URL
    c.setopt(pycurl.HTTPHEADER, headers)  # request headers
    # c.setopt(pycurl.POST, 1)
    # c.setopt(pycurl.POSTFIELDS, data)  # POST body
    c.setopt(c.WRITEFUNCTION, c.fp.write)  # write callback fills the buffer
    try:
        c.perform()
        html = c.fp.getvalue()  # response body
    finally:
        # fix: the original never released the handle (leaked one libcurl
        # easy handle + connection per call); the HTTP_CODE it read was
        # also unused, so it is dropped.
        c.close()
    return html
# 通过正则提取元素
def search(req, html):
    """Apply regex *req* to *html* and return capture group 1, or 'no' if there is no match."""
    found = re.search(req, html)
    return found.group(1) if found else 'no'
# 去除文章url、多余标签等、补全路径等
def content_sort(content):
    """Clean scraped article HTML: strip wrapper tags/scripts, keep the text.

    NOTE(review): the regex literals in the original were corrupted when the
    file was extracted -- the HTML tag text inside each pattern was stripped
    (e.g. ``'?span.*?>'`` was evidently ``'</?span.*?>'``).  The patterns
    below reconstruct the evident intent (turn line-break tags into newlines,
    drop inline wrapper tags, remove script blocks and HTML comments, strip
    whitespace entities) but should be checked against a pristine copy.
    """
    # line-break tags become real newlines
    content = re.sub(r'<br\s*/?>', '\n', content, flags=re.I)
    content = re.sub(r'</p>', '\n', content, flags=re.I)
    # strip opening/closing wrapper tags, keeping their inner text.
    # fix: the original passed re.IGNORECASE as re.sub's 4th positional
    # argument, which is *count* (so it replaced at most 2 occurrences and
    # was NOT case-insensitive); flags must be passed as flags=.
    for pat in (r'</?span.*?>', r'</?a.*?>', r'</?img.*?>', r'</?div.*?>',
                r'</?iframe.*?>', r'</?center.*?>', r'</?font.*?>',
                r'</?strong.*?>', r'</?p.*?>'):
        content = re.sub(pat, '', content, flags=re.I)
    # drop script blocks and HTML comments wholesale (non-greedy, any chars)
    content = re.sub(r'<script[\s\S]*?</script>', '', content, flags=re.I)
    content = re.sub(r'<!--[\s\S]*?-->', '', content)
    # whitespace entities / stray control characters
    content = re.sub(r'&nbsp;', '', content)
    content = re.sub(r'\r', '', content)
    content = re.sub(r'\t', '', content)
    return content
#域名与正则、编码对应表
# Domain -> {'title': regex, 'content': regex, 'decode': page charset} table
# driving per-site extraction in the main loop.
# NOTE(review): the regex literals below were corrupted by extraction -- the
# HTML tag text inside each pattern was stripped, and several entries (e.g.
# stock.sohu.com, finance.ce.cn, stock.caijing.com.cn) also lost their
# 'decode' key and closing brace, so this dict no longer parses as written.
# Restore the patterns from a pristine copy of the script.
req_dict = {
'finance.sina.com.cn':
{'title':'
(.*?)','content':'([\s\S]*?)','decode':'utf-8'},
'stock.eastmoney.com':
{'title':'
(.*?)','content':'
([\s\S]*?)
','decode':'gbk'},
'finance.eastmoney.com':
{'title':'
(.*?)','content':'
([\s\S]*?)
','decode':'gbk'},#ok
'guba.eastmoney.com':
{'title':'
(.*?)_.*?','content':'
([\s\S]*?)
','decode':'utf-8'},#ok
'stock.jrj.com.cn':
{'title':'
(.*?)-','content':'
([\s\S]*?)
','decode':'gbk'},
'hk.jrj.com.cn':
{'title':'
(.*?)-','content':'
([\s\S]*?)
','decode':'gbk'},
'hkstock.cnfol.com':
{'title':'
(.*?)_.*?','content':'
([\s\S]*?)','decode':'utf-8'},#ok
'sc.stock.cnfol.com':
{'title':'
(.*?)_.*?','content':'
([\s\S]*?)','decode':'utf-8'},#ok
'money.163.com':
{'title':'
(.*?)_.*?','content':'
([\s\S]*?)','decode':'utf-8'},
'www.chinastock.com.cn':
{'title':'
([\s\S]*?)
','content':'
([\s\S]*?)
','decode':'utf-8'},
'stock.huagu.com':
{'title':'
([\s\S]*?)
','content':'
([\s\S]*?)
','decode':'utf-8'},
'stock.sohu.com':
{'title':'
([\s\S]*?)
','content':'
([\s\S]*?)
'stock.cngold.org':
{'title':'
(.*?)-.*?','content':'
([\s\S]*?)
','decode':'utf-8'},
'hk.stock.hexun.com':
{'title':'
(.*?)[-_|].*?','content':'
([\s\S]*?)
','decode':'utf-8'},
'stock.gucheng.com':
{'title':'
(.*?)[-_|].*?','content':'
([\s\S]*?)
','decode':'utf-8'},
'www.cnstock.com':
{'title':'
(.*?)-.*?','content':'
([\s\S]*?)
','decode':'gbk'},
'www.ccstock.cn':
{'title':'
(.*?)-.*?','content':'
([\s\S]*?)
','decode':'utf-8'},
'news.emoney.cn':
{'title':'
(.*?)-.*?','content':'
([\s\S]*?)
','decode':'utf-8'},
'finance.ce.cn':
{'title':'
(.*?)','content':'
([\s\S]*?)
'www.p5w.net':
{'title':'
(.*?)[_-|].*?','content':'
([\s\S]*?)
','decode':'gbk'},
'www.nbd.com.cn':
{'title':'
(.*?)[_-|][\s\S]*?','content':'
([\s\S]*?)
([\s\S]*?)
','decode':'gbk'},
'stock.caijing.com.cn':
{'title':'
(.*?)[-_|].*?','content':'
([\s\S]*?)
}
def id():
    """Return the next article id for phpcms (row count of v9_news + 1).

    The front end needs the article URL written into the database, so the
    caller builds the URL from this predicted id before inserting.
    NOTE(review): this shadows the builtin ``id``; the name is kept because
    the rest of the script calls it.
    """
    con = MySQLdb.connect('localhost', 'root', '', 'phpcmsv9', charset='utf8')
    try:
        cur = con.cursor()
        # fix: the original ran "SELECT id ... WHERE title = title" (a
        # tautology, modulo NULL titles) and used cur.rowcount of the full
        # result set just to count rows; COUNT(*) asks the server for the
        # same number directly without transferring every row.
        cur.execute("select count(*) from v9_news")
        numrows = int(cur.fetchone()[0])
    finally:
        con.close()  # fix: the connection was never closed before
    return numrows + 1
def CmsSQL(title, content):
    """Insert one article into phpcms: body into v9_news_data, meta into v9_news.

    Relies on the module-level globals ``idnum`` (predicted article id) and
    ``urlid`` (front-end URL) being set by the caller before this is called.
    Both INSERTs use parameterized queries, so the scraped text cannot inject SQL.
    """
    now = int(time.time())  # single timestamp so inputtime == updatetime
    db = MySQLdb.connect('localhost', 'root', '', 'phpcmsv9', charset='utf8')
    try:
        cursor = db.cursor()
        cursor.execute("insert into v9_news_data (content,id) values(%s,%s)",
                       (content, idnum))
        # catid=6 and typeid=0 are fixed for this news category
        cursor.execute("insert into v9_news(title,catid,typeid,url,inputtime,updatetime) values(%s,6,0,%s,%s,%s)",
                       (title, urlid, now, now))
        db.commit()
    finally:
        db.close()  # fix: the connection leaked if an execute/commit raised
# ---------------------------------------------------------------------------
# Main script (Python 2: print statement, dict.has_key, u'' literals):
# poll the Baidu News RSS feed, fetch each not-yet-seen article, extract
# title/content via the per-domain regexes in req_dict, clean them, and
# publish into the phpcms database.  urls.txt is the dedup ledger.
# ---------------------------------------------------------------------------
url = 'https://news.baidu.com/n?cmd=4&class=gegu&tn=rss'
# NOTE(review): the pattern below is empty -- the original regex (presumably
# something like r'<link>(.*?)</link>' over the RSS feed) was stripped by
# extraction, so this findall returns nothing as written.  Restore it.
urls = re.findall(r'',curl(url))
urls.reverse()  # oldest first, so articles are published in feed order
for url in urls:
    with open('urls.txt') as f1 :
        if url not in f1.read(): # skip URLs already collected
            url.strip()  # NOTE(review): no-op -- str.strip() returns a new string that is discarded
            f1.close()  # NOTE(review): redundant -- the with block closes f1 itself
            line = url.split('/')[2]  # the domain portion of the article URL
            if req_dict.has_key(line): # do we have regexes for this site?
                time.sleep(1)  # throttle between fetches
                try:
                    # look up the site's title/content regexes and page
                    # charset; content is prefixed with the source URL
                    title = search(req_dict[line]['title'],curl(url)).decode(req_dict[line]['decode'])
                    content = url + search(req_dict[line]['content'],curl(url)).decode(req_dict[line]['decode'])
                except:
                    # NOTE(review): bare except silently skips any failure
                    # (network, regex, decode) for this article
                    continue
                # front-end URL for the article about to be inserted; id()
                # predicts the next v9_news id
                urlid = 'https://localhost/index.php?m=content&c=index&a=show&catid=6&id=%s' %id()
                idnum = int(id())
                print id(),content_sort(title)
                CmsSQL(content_sort(title),content_sort(content))
                # remember this URL so it is not published twice
                f1w =open('urls.txt','a+')
                f1w.write(url+'\n')
                f1w.close()
            else:
                print u'正则不存在'
                # log domains that still need a regex written for them
                open('requrl','a+').write(url+'\n')
        else:
            print u'此url在列表中:'
# 开发环境 python2.x。脚本是一个小哥哥给的,忘了……刷抖音去了!