# python新闻标题分类_Python收集标题,python,采集,头条,文章

# Author: song

from multiprocessing import Pool

from urllib.parse import urlencode

import requests

import json

from requests import RequestException

from bs4 import BeautifulSoup

import re

import pymongo

# Module-level MongoDB handles shared by save_to_mongodb().
# connect=False is passed so the client does not connect at construction
# time; presumably this is because the client is created before the
# multiprocessing Pool spawns its workers -- TODO confirm.
client = pymongo.MongoClient('localhost', connect=False)

# Database that holds the scraped articles.
db = client['toutiaowenzhang']

def get_index(offset):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Paging offset sent to the search endpoint. The entry point
            of this file passes multiples of 20, matching ``count``.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``requests`` failure).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '美文',
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # The request itself must live inside the try block: connection
        # errors and timeouts are raised by requests.get(), not by the
        # status-code check. The timeout keeps a stalled server from
        # blocking a pool worker forever.
        response = requests.get(url, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

def get_urls(html):
    """Yield the ``article_url`` of every entry in a search-result payload.

    Args:
        html: JSON text as returned by the search endpoint, or ``None`` /
            empty when the download failed.

    Yields:
        The value of ``article_url`` for each item under the top-level
        ``data`` key. Items without that key yield ``None``, so callers
        should skip falsy values.

    Raises:
        ValueError: If ``html`` is non-empty but not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if not html:
        # A failed download yields nothing instead of crashing json.loads().
        return
    payload = json.loads(html)
    if not isinstance(payload, dict):
        return
    # ``data`` may be missing or null; treat both as "no results".
    for item in payload.get('data') or []:
        yield item.get('article_url')

def get_index_detail(url):
    """Download the HTML of a single article page.

    Args:
        url: Absolute URL of the article page.

    Returns:
        The page body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``requests`` failure).
    """
    try:
        # requests.get() is what raises RequestException, so it has to be
        # inside the try block for the handler to be reachable. The timeout
        # keeps a stalled server from blocking a pool worker forever.
        response = requests.get(url, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_detail(html):
    """Extract the title and a cleaned-up article text from a detail page.

    Args:
        html: Page source as text, or ``None`` / empty when the download
            failed (e.g. the article URL led to a 404 page).

    Returns:
        A dict with the keys ``title`` and ``article``, or ``None`` when
        there is no page source or the page has no ``<title>`` element.
    """
    if not html:
        # Explicit guard instead of swallowing the TypeError that
        # BeautifulSoup raises for None input.
        return None

    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    if not title_tags:
        return None
    title = title_tags[0].get_text()

    # NOTE(review): the original pattern literal was garbled when this file
    # was extracted (only the fragments "content.", "?" and "?)" survived).
    # The pattern below is a reconstruction that captures a single-quoted
    # value following "content" -- confirm against the real page markup.
    compile_allarticle = re.compile(r"content.*?'(.*?)'", re.S)
    allarticle = re.findall(compile_allarticle, html)

    # Strip every ASCII letter, digit and the characters / # ; & . _ from
    # the stringified list of matches. str() of the list is kept from the
    # original code, so list brackets and quotes remain in the result.
    article = re.sub('[a-zA-Z0-9/#;&._]', '', str(allarticle)).strip()

    return {
        'title': title,
        'article': article,
    }

def save_to_mongodb(result):
    """Insert one parsed article into the ``toutiaowenzhang`` collection.

    Prints ``successful`` when the document was stored and ``fail``
    otherwise.

    Args:
        result: Document (dict) to store, or ``None`` when parsing failed.
    """
    if result is None:
        # Nothing was parsed; report it rather than passing None to the driver.
        print('fail')
        return
    # insert_one() replaces the deprecated Collection.insert(), which was
    # removed in PyMongo 4.
    outcome = db['toutiaowenzhang'].insert_one(result)
    if outcome.inserted_id is not None:
        print('successful')
    else:
        print('fail')

def main(offset):
    """Scrape one page of search results and store every parsed article.

    Args:
        offset: Paging offset forwarded to ``get_index``.
    """
    html = get_index(offset)
    if html is None:
        # The index request failed; there is nothing to iterate over.
        return
    for article_url in get_urls(html):
        if not article_url:
            continue
        detail_html = get_index_detail(article_url)
        result = parse_detail(detail_html)
        # parse_detail() returns None for pages it could not parse; only
        # real documents are written to the database.
        if result:
            save_to_mongodb(result)

# Script entry point: scrape the first three result pages (offsets 0, 20
# and 40) in parallel, one pool task per page.
if __name__ == '__main__':
    groups = [x * 20 for x in range(3)]
    # The context manager releases the worker processes once map() returns.
    with Pool() as pool:
        pool.map(main, groups)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值