Toutiao News Crawling + Storm Stream Processing and Storage (1): The Crawling Part

Project Overview

The project as a whole is divided into three parts:

  1. Crawling news from Toutiao
  2. Running entity analysis on the body text of the crawled articles and visualizing the results
  3. Using the Storm framework to store the crawled news data in MySQL

This post covers the Toutiao crawling part. The overall framework of the project is shown below.
[Figure: overall project architecture]
Since Kafka-related pieces come up below, see this post for that background: Setting Up a Stream Processing Platform
For the entity analysis part, see: Entity Analysis
For the Storm stream processing part, see: Storm Stream Processing
Project download: Toutiao Crawling + Entity Analysis + Storm Stream Processing

Code Walkthrough

main.py
The entry point of the program. To keep the crawler running indefinitely, it uses an infinite loop: each round calls get_page_index() and then sleeps for five minutes.

# -*- coding: utf-8 -*-

from get_page_index import get_page_index
import time

def main():
    get_page_index()

if __name__ == "__main__":
    a=1
    while True:
        print('<----------------------第'+str(a)+'轮启动---------------------->\n')
        main()
        print('<----------------------第' + str(a) + '轮结束---------------------->\n')
        print('<----------------------进入休眠---------------------->\n')
        time.sleep(300)
        a+=1
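
One caveat with the loop above: any uncaught exception thrown inside get_page_index() will kill the whole process. A minimal sketch (not the original author's code) that logs the error and moves on to the next round instead:

# a minimal sketch: keep the crawl loop alive even when a single round fails
import time
import traceback
from get_page_index import get_page_index

while True:
    try:
        get_page_index()
    except Exception:
        traceback.print_exc()   # log the failure instead of letting the loop die
    time.sleep(300)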

get_page_index.py
This function is the glue between the other modules: it fetches the list of article IDs, grabs a batch of proxy IPs, and then crawls each article one by one with a short random delay.

import time
import random
from paser_page_index import paser_page_index
from get_ip_index import get_ip_index
from get_page_detail import get_page_detail

def get_page_index():
    print('-----------------开始获取网页列表-----------------')
    list=paser_page_index()
    print('-----------------获取网页列表结束-----------------\n')
    print('-----------------开始获取代理IP-----------------')
    ip_list=get_ip_index()

    print('-----------------开始爬取网页-----------------')
    for i in list:
        url='https://www.toutiao.com/a'+i
        a=get_page_detail(url,ip_list)
        time.sleep(random.randint(3, 5))
        # if a==0:
        #     print('这是一条广告或者无法解析该网页')
        # if a==1:
        #     print('这篇文章是个问答')
        # if a==2:
        #     print('这是一个图片类文章')
    print('-----------------爬取网页结束-----------------\n')

paser_page_index.py
This function builds the list of articles (group IDs) we need to crawl by querying Toutiao's PC feed API for each news category.

import time
import requests
from bs4 import BeautifulSoup
import hashlib

# Computes the as/cp signature parameters required by Toutiao's feed API
def get_as_cp_args():
    zz = {}
    now = round(time.time())
    e = hex(int(now)).upper()[2:]  # hex() converts an integer to its hexadecimal string form
    i = hashlib.md5(str(int(now)).encode("utf8")).hexdigest().upper()  # md5(...).hexdigest() returns the hex digest of the timestamp
    if len(e) != 8:
        zz = {'as': "479BB4B7254C150",
              'cp': "7E0AC8874BB0985"}
        return zz
    n = i[:5]
    a = i[-5:]
    r = ""
    s = ""
    for i in range(5):
        s = s + n[i] + e[i]
    for j in range(5):
        r = r + e[j + 3] + a[j]
    zz = {
        'as': "A1" + s + e[-3:],
        'cp': e[0:3] + r + "E1"
    }
    return zz


def paser_page_index():
    url1 = [
           'https://www.toutiao.com/api/pc/feed/?category=news_hot',
           'https://www.toutiao.com/api/pc/feed/?category=news_tech',
           'https://www.toutiao.com/api/pc/feed/?category=news_entertainment',
           'https://www.toutiao.com/api/pc/feed/?category=news_game',
           'https://www.toutiao.com/api/pc/feed/?category=news_sports',
           'https://www.toutiao.com/api/pc/feed/?category=news_car',
           'https://www.toutiao.com/api/pc/feed/?category=news_finance',
           'https://www.toutiao.com/api/pc/feed/?category=funny',
           'https://www.toutiao.com/api/pc/feed/?category=news_military',
           'https://www.toutiao.com/api/pc/feed/?category=news_world',
           'https://www.toutiao.com/api/pc/feed/?category=news_fashion',
           'https://www.toutiao.com/api/pc/feed/?category=news_travel',
           'https://www.toutiao.com/api/pc/feed/?category=news_discovery',
           'https://www.toutiao.com/api/pc/feed/?category=news_baby',
           'https://www.toutiao.com/api/pc/feed/?category=news_regimen',
           'https://www.toutiao.com/api/pc/feed/?category=news_essay',
           'https://www.toutiao.com/api/pc/feed/?category=news_history',
           'https://www.toutiao.com/api/pc/feed/?category=news_food'
            ]
    list=[]
    for i in url1:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        cookie = 'tt_webid=6762050087801406989; tt_webid=6762050087801406989; csrftoken=be4be279678742cea85ca2bfc0b308c8; WEATHER_CITY=%E5%8C%97%E4%BA%AC; s_v_web_id=c05bab65d1f25e1c6b72817b6f34f92a; __tasessionId=lgnzbs4ah1578017571495'
        headers = {'user-agent': user_agent, 'cookie': cookie,'referer': i}
        as_cp=get_as_cp_args()
        url2 = '&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as='+as_cp['as']+'&cp='+as_cp['cp']
        respond = requests.get(i + url2, headers=headers)
        # soup = BeautifulSoup(respond.text, 'html.parser')   # debug: dump the raw response
        # print(soup)
        try:
            if respond.status_code == 200:
                dict1 = respond.json()
                for item in dict1['data']:
                    list.append(item['group_id'])
        except:
            pass
    return list
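
To make the moving parts easier to see: get_as_cp_args() produces the two signature values, they are appended to each category feed URL, and every group_id in the JSON response later becomes an article URL in get_page_index.py. A quick sanity check, assuming the feed API still returns JSON of the form {"data": [{"group_id": ...}, ...]}:

# a small sketch for inspecting the output of this module (response shape assumed as noted above)
from paser_page_index import get_as_cp_args, paser_page_index

if __name__ == "__main__":
    print(get_as_cp_args())                 # e.g. {'as': 'A1...', 'cp': '...E1'}
    group_ids = paser_page_index()
    print('got', len(group_ids), 'article ids')
    for gid in group_ids[:5]:
        print('https://www.toutiao.com/a' + str(gid))   # the URL format used in get_page_index.py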

get_ip_index.py
This module scrapes proxy IPs and returns a list of usable ones. Because the crawler keeps hitting Toutiao for long stretches, a single IP gets banned quickly, so we rotate through proxies.

import requests
from bs4 import BeautifulSoup
import random

def get_ip_index():
    randomlist=['/nn/','/wn/','/wt/']
    url='https://www.xicidaili.com'+random.choice(randomlist)+str(random.randint(1,3))
    print('代理IP来源网址:',url)
    list=[]
    proxies={}
    start=random.randint(1,40)
    end=random.randint(50,90)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    r=requests.get(url,headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    tag=soup.find_all('tr')
    for j in tag[start:end]:
        tag1=j.find_all('td')
        list.append(tag1[1].text+':'+tag1[2].text)
    # The block below checks whether each proxy actually works; it is optional
    # for i in list:
    #     try:
    #         ip="https://" + i
    #         # print(ip)
    #         proxies['https']=ip
    #         r=requests.get('https://www.baidu.com',headers=headers,proxies=proxies,timeout=(3,7))
    #     except:
    #         list.remove(i)
    print('-----------------成功获得代理' + str(len(list)) + '个-----------------\n')
    return list
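
One caution about the commented-out validation block: it removes items from the list while iterating over it, which silently skips entries. If you want to enable validation, here is a sketch that collects the working proxies into a new list instead (same Baidu probe as the original):

# a sketch of proxy validation that avoids mutating the list while iterating over it
import requests

def filter_proxies(ip_list, headers):
    good = []
    for ip in ip_list:
        proxies = {'https': 'https://' + ip}
        try:
            requests.get('https://www.baidu.com', headers=headers,
                         proxies=proxies, timeout=(3, 7))
            good.append(ip)      # keep only proxies that answered in time
        except Exception:
            pass                 # drop proxies that error out or time out
    return good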

get_page_detail.py
This is the main crawling function: it fetches an article page (rotating proxies when needed), extracts the metadata and body, and hands the result off to Kafka and the entity-analysis module.

import requests
from bs4 import BeautifulSoup
import re
from my_kafka import kafka_produce
from get_ip_index import get_ip_index
from get_article import get_article
from text_grapher import Entity_extraction

def get_page_detail(url,ip_list):
    proxies = {}
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    headers = {'user-agent': user_agent, 'x-requested-with': 'XMLHttpRequest'}
    print('当前获取网页:',url)
    while True:
        try:
            if proxies:
                r = requests.get(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=(3, 7))
            else:
                r = requests.get(url, headers=headers, allow_redirects=False, timeout=(3, 7))
            if r.status_code == 200:
                break
        except:
            pass
        # the request failed or came back non-200: rotate to the next proxy
        if not ip_list:
            ip_list = get_ip_index()
        proxies['https'] = 'https://' + ip_list[0]
        ip_list.pop(0)
    r.encoding = 'utf-8'
    article = {}
    article['url']=url
    soup = BeautifulSoup(r.text, 'html.parser')
    # print(soup.prettify())
    Str = soup.text
    try:
        tag_type = re.findall(re.compile(r'chineseTag: \'(.*?)\'', re.S), Str)
    except:
        return 0
    if tag_type == [] or tag_type == ['']:
        return 0
    if tag_type == ['问答']:  # Q&A post
        return 1
    if tag_type == ['图片']:  # image gallery
        return 2
    # category
    article['type'] = tag_type[0]
    # title
    title_result=re.findall(re.compile(r'title: \'(.*?)\'', re.S), Str)[0]
    title=re.sub(r'[\\/:*?"<>|]', '', title_result)
    article['title']=title
    # time, comment count, source, cover image, keywords
    article['time'] = re.findall(re.compile(r'time: \'(.*?)\'', re.S), Str)[0]
    article['comments_count']=re.findall(re.compile(r'comments_count: (.*?),', re.S), Str)[0]
    article['source']=re.findall(re.compile(r'source: \'(.*?)\'', re.S), Str)[0]
    article['coverImg']=re.findall(re.compile(r'coverImg: \'(.*?)\'', re.S), Str)[0]
    article['keywords']=re.findall(re.compile(r'{"name":\"(.*?)\"}', re.S), Str)
    keywords=''
    for i in re.findall(re.compile(r'{"name":\"(.*?)\"}', re.S), Str):
        keywords=keywords+i+'\t'
    # body text
    text=get_article(r)
    article['news']=text
    # publish the article to Kafka
    kafka_produce(str([article]).replace("&quot;",""),url)
    # entity analysis
    Entity_extraction(text,title.replace("&quot;",""))
    # print(article)
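
The regular expressions above pull their fields out of JavaScript variables that Toutiao embeds in the article page. A small illustration on a hypothetical snippet shaped the way the regexes expect (real pages embed more fields than this):

# a sketch using a made-up page fragment in the shape the regexes above expect
import re

Str = """
    chineseTag: '科技',
    source: '某某科技',
    time: '2020-01-03 10:00:00',
    comments_count: 12,
    title: '示例标题',
"""
print(re.findall(re.compile(r'chineseTag: \'(.*?)\'', re.S), Str))    # ['科技']
print(re.findall(re.compile(r'comments_count: (.*?),', re.S), Str))   # ['12']
print(re.findall(re.compile(r'title: \'(.*?)\'', re.S), Str))         # ['示例标题']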

get_article.py
This function extracts the news body from a Toutiao article. It is the fiddly part: the body embedded in the page is littered with HTML tags that have to be stripped out, and we also want to keep the text and the inline image URLs in their original order, which makes the implementation a bit involved.

from bs4 import BeautifulSoup
import re

def get_article(response):
    string=''
    soup = BeautifulSoup(response.text , features="lxml")
    # .encode('utf-8').decode("unicode-escape")
    body = soup.find('body')
    script4 = body.find_all('script')

    rawMeterial = re.findall("articleInfo:([\s\S]*)tagInfo:", str(script4[3])[23:][:-10])[0]

    pipeiArticle = "content: '([\s\S]*)groupId:"
    Article = re.findall(pipeiArticle, rawMeterial)

    # print(Article)

    a = Article[0].strip()
    b = a.split(r'\u003Cp\u003E')
    for each in b:
        each2 = each.replace(r'\u003C','<').replace(r'p\u003E','p>').replace(r'\u002F','\\').replace(r'\u003E','>')
        if '<\p>' in each2:
            # print(each2.index('<\p>'))
            each3 = each2[:each2.index('<\p>')].strip()
        # print(re.sub(re.compile("<\\p>(.*?)"), "", each2))
            each4 = re.sub(re.compile("<(.*?)>"), "", each3)
            # print(re.sub(re.compile("<(.*?)>"), "", each3))
            string=string+each4+'\n'
            # print(each4)

        pipeiSource = "<img src([\s\S]*)\&quot; img_width"
        pipeiSource2 = "http:([\s\S]*)"
        source2 = re.findall(pipeiSource, each2)
        # print(each2)
        # print(source2)
        if source2 != []:
            # print(source2)
            source3 = source2[0].split('\&quot; img_width')
            # print(source3)
            for each in source3:
                source4 = re.findall(pipeiSource2, each)
                # print('http:' + source4[0])
                string = string + str('http:' + source4[0]).strip() + '\n'
            # print(source2[0][13:][:-1].strip())
        # print('\n')

        # pipeiSource = "<img src([\s\S]*)\&quot; img_width"
        # source2 = re.findall(pipeiSource, each2)
        # if source2 != []:
        #     string=string+source2[0][13:][:-1].strip()+'\n'
        #     # print(source2[0][13:][:-1].strip())
    return string.replace("&quot;","")
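
The \u003C / \u003E style sequences in the embedded body are just Unicode escapes for angle brackets. A short illustration of what they decode to, and of what the replace chain in the loop above turns a tag into (the commented-out unicode-escape decode near the top of the function is the more general way to do the first step):

# a sketch of what the escaped sequences in the embedded body stand for
escaped = r'\u003Cp\u003Ehello\u003C\u002Fp\u003E'
print(escaped.encode('utf-8').decode('unicode-escape'))   # -> <p>hello</p>

# and what the replace chain inside get_article() turns the same string into
each2 = escaped.replace(r'\u003C', '<').replace(r'p\u003E', 'p>') \
               .replace(r'\u002F', '\\').replace(r'\u003E', '>')
print(each2)   # -> <p>hello<\p>   (hence the check for '<\p>' in the loop)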

my_kafka.py
This module connects to Kafka and publishes each crawled article so that the Storm topology can consume it downstream.

# -*- coding: utf-8 -*-

from kafka import KafkaProducer
from kafka.errors import KafkaError

KAFAKA_HOST="192.168.161.100"
KAFAKA_PORT = 9092  # broker port
KAFAKA_TOPIC = "today_news"  # topic name

class Kafka_producer():
    def __init__(self, kafkahost, kafkaport, kafkatopic):
        self.kafkaHost = kafkahost
        self.kafkaPort = kafkaport
        self.kafkatopic = kafkatopic
        self.producer = KafkaProducer(bootstrap_servers='{kafka_host}:{kafka_port}'.format(
            kafka_host=self.kafkaHost,
            kafka_port=self.kafkaPort)
        )

    def sendjsondata(self, params):
        try:
            parmas_message = params  # params is already a string here; use json.dumps first if you pass a dict
            producer = self.producer
            producer.send(self.kafkatopic, value=parmas_message.encode('utf-8'))
            producer.flush()
        except KafkaError as e:
            print(e)

def kafka_produce(params,url):
    # producer side
    producer = Kafka_producer(KAFAKA_HOST, KAFAKA_PORT, KAFAKA_TOPIC)
    print("======> producer:", url, '\n')
    producer.sendjsondata(params)

The code of the entity-analysis module is not covered in detail here.

Summary

Crawling Toutiao is not especially difficult overall. The main wrinkle is that Toutiao recently upgraded its API: requests now have to carry the as and cp signature parameters, and the code that reproduces that signing algorithm is adapted from someone else's work. The end result works quite well. In the next posts I will cover the other two parts of the project, so stay tuned if you are interested.
