目标:不用各个搜索网站去获取互联网热点,通过聚合式来显示当前网络上关注的热点,有利于我们节省时间(我们的时间都是很宝贵的~~)。
实现技术:Python(只使用此技术,大牛可以直接无视)
先来讲一下处理过程:
一、python 插件(不会安装的自行百度)
BeautifulSoup
二、爬虫处理基础类的封装(HttpUtils.py)
# -*- coding: utf-8 -*-
from urllib import request, parse
from urllib.parse import urlparse
from urllib.parse import quote
from urllib.parse import urlencode
import string
import re
import os,sys
from bs4 import BeautifulSoup
#多线程支持
import threading
#导入 time 包
import time
class HttpUtils(object):
    """Thin urllib wrapper that downloads a page and returns a BeautifulSoup object."""

    # Base URL of the site (scheme://netloc, or the page's <base href> if present).
    _baseUrl = ''
    # URL the instance was constructed for.
    _siteUrl = ''

    def __init__(self, siteUrl):
        """Resolve the effective base URL of *siteUrl*.

        Fetches the page once; a <base href="..."> tag, when present,
        overrides the scheme://netloc default. If the fetch fails the
        class-level empty defaults are left untouched.
        """
        parts = urlparse(siteUrl)
        baseUrl = parts.scheme + "://" + parts.netloc
        bsObj = self.get_htmlReturnBS4(siteUrl)
        if bsObj is None:
            # Page could not be fetched; keep the '' defaults.
            return
        base_tags = bsObj.find_all('base', href=True)
        if base_tags:
            baseUrl = base_tags[0].attrs['href']
        self._baseUrl = baseUrl
        self._siteUrl = siteUrl

    def get_htmlReturnBS4(self, url, tryTime=5):
        """Download *url* and parse it into a BeautifulSoup object.

        Returns None for a blank URL or a non-200 response. On any error
        the request is retried up to *tryTime* more times with a 5 s pause
        between attempts; None is returned when the retries are exhausted.
        """
        statusCode = 0
        try:
            if url.strip() == '':
                return None
            # Percent-encode characters outside the printable ASCII set.
            url = quote(url, safe=string.printable)
            # A browser-like User-Agent avoids "HTTP Error 403: Forbidden"
            # from sites that reject the default urllib client string.
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
            req = request.Request(url, headers=headers)
            req.add_header('Referer', url)
            # Context manager closes the connection deterministically
            # (the original never closed the response object).
            with request.urlopen(req) as page:
                statusCode = page.status
                if statusCode != 200:
                    return None
                html_code = page.read()
            try:
                # The scraped sites mostly serve GB2312-encoded pages.
                return BeautifulSoup(html_code, 'html.parser', from_encoding="gb2312")
            except Exception:
                # Last resort: retry the parse assuming plain ASCII.
                return BeautifulSoup(html_code, 'html.parser', from_encoding="ascii")
        except Exception as e:
            # Exception, not BaseException, so Ctrl-C still interrupts retries.
            print('get_htmlReturnBS4(Exception:%s|url:|%s|statusCode:|%s) \t' % (e, url, statusCode))
            if tryTime == 0:
                return None
            time.sleep(5)
            return self.get_htmlReturnBS4(url, tryTime - 1)
三、各个主要信息来源的处理(GetHotLines.py)
# coding:utf-8
from urllib import request, parse
from urllib.parse import unquote
import re
import os,sys
import bs4
from HttpUtils import *
from bs4.element import Comment
from Utils import *
import uuid
import json
class GetHotLine(object):
    """Scrapes the trending boards of Baidu, Tieba, Weibo and Zhihu.

    Every public scraper returns a list of dicts shaped
    {"title": ..., "hotValue": ..., "url": ...}.
    """

    # Shared <li> template used when rendering any board to HTML.
    _ITEM_TEMPLATE = "<li><a style='color: black;font-size:8px;' target='_blank' href = '{url}'>{title} - <span style='color:red;'> {hotValue}</span></a> </li>"

    def baiduHotSearch(self):
        """Baidu front-page hot-search list, parsed from the embedded JSON blob."""
        url = 'https://www.baidu.com'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        # The board data ships inside the element with id="hotsearch_data".
        hotsearch_json = json.loads(html_bs4.find(id='hotsearch_data').text)
        l = []
        for x in hotsearch_json["hotsearch"]:
            # Only the three fields we publish are read; the original also read
            # isViewed/isNew/hotTags (unused, and a KeyError risk if absent).
            l.append({
                "title": x['pure_title'],
                "hotValue": x['heat_score'],
                # linkurl arrives percent-encoded.
                "url": unquote(x['linkurl']),
            })
        return l

    def baiduHotTalk(self):
        """Baidu Tieba hot-topic ("hot talk") list."""
        l = []
        url = 'http://tieba.baidu.com/hottopic/browse/topicList?res_type=1'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        for item in html_bs4.findAll('li', attrs={'class': 'topic-top-item'}):
            text_tags = item.findAll('a', attrs={'class': 'topic-text'})
            num_tags = item.findAll('span', attrs={'class': 'topic-num'})
            l.append({
                "title": text_tags[0].text,
                "hotValue": num_tags[0].text,
                'url': text_tags[0].get('href'),
            })
        return l

    def weiboHotSearch(self):
        """Weibo realtime hot-search board."""
        l = []
        url = 'https://s.weibo.com/top/summary?cate=realtimehot'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        for cell in html_bs4.findAll('td', attrs={'class': 'td-02'}):
            # Guard BEFORE dereferencing .a (the original checked it only
            # after already calling td_02[i].a.text, so the check was dead).
            if cell.a is None:
                continue
            keyword = cell.a.text.strip().replace('\n', '')
            # Pinned entries carry no hit-count span.
            if cell.span is None:
                times = '0'
            else:
                times = cell.span.text.strip().replace('\n', '')
            linkUrl = "https://s.weibo.com/" + cell.a.get('href')
            l.append({"title": keyword, "hotValue": times, 'url': linkUrl})
        return l

    def weiboYW(self):
        """Weibo social-event ("key news") board; the page exposes no heat value."""
        l = []
        url = 'https://s.weibo.com/top/summary?cate=socialevent'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        for cell in html_bs4.findAll('td', attrs={'class': 'td-02'}):
            if cell.a is None:
                continue
            keyword = cell.a.text.strip().replace('\n', '').replace('#', '')
            linkUrl = "https://s.weibo.com/" + cell.a.get('href')
            l.append({"title": keyword, "hotValue": "0", 'url': linkUrl})
        return l

    def zh_rb(self):
        """Zhihu billboard, parsed from the page's js-initialData JSON."""
        l = []
        url = 'https://www.zhihu.com/billboard'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        hot_list_json = json.loads(html_bs4.find(id='js-initialData').text)
        for x in hot_list_json["initialState"]['topstory']['hotList']:
            target = x['target']
            l.append({
                "title": target['titleArea']['text'],
                "hotValue": target['metricsArea']['text'],
                'url': target['link']['url'],
            })
        return l

    def getAll2List(self):
        """Fetch every board and return them keyed by source name."""
        return {'baiduHotSearch': self.baiduHotSearch(),
                'baiduHotTalk': self.baiduHotTalk(),
                'weiboHotSearch': self.weiboHotSearch(),
                'weiboYW': self.weiboYW(),
                'zh_rb': self.zh_rb()}

    def _render_items(self, items):
        """Render one board's item list into a string of <li> anchor rows."""
        return ''.join(
            self._ITEM_TEMPLATE.format(title=item["title"],
                                       hotValue=item["hotValue"],
                                       url=item["url"])
            for item in items)

    def getAll2HTML(self):
        """Fetch every board and render a single static HTML overview page.

        NOTE: the original template used an invalid <ui> tag (and one
        broken `</ui` missing its `>`); both fixed to proper <ul> tags.
        """
        html = '''<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
<table>
<tr>
<td valign="top" style='background-color:LightSkyBlue;width:350px;'>
一、百度-热搜榜
</td>
<td valign="top" style='background-color:LightSkyBlue;width:350px;'>
二、百度-热议榜
</td>
<td valign="top" style='background-color:LightSkyBlue;width:350px;'>
三、新浪-热搜榜
</td>
<td valign="top" style='background-color:LightSkyBlue;width:350px;'>
四、新浪-要闻榜
</td>
</tr>
<tr>
<td valign="top">
<ul>{baiduHotSearch}</ul>
</td>
<td valign="top">
<ul>{baiduHotTalk}</ul>
</td>
<td valign="top">
<ul>{weiboHotSearch}</ul>
</td>
<td valign="top">
<ul>{weiboYW}</ul>
</td>
</tr>
<tr>
<td colspan='4' valign="top" style='background-color:LightSkyBlue;'>
五、知乎-热榜
</td>
</tr>
<tr>
<td colspan='4' valign="top">
<ul>{zh_rb}</ul>
</td>
</tr>
</table>
</body>
</html>
'''
        # One shared rendering helper replaces the five copy-pasted loops.
        return html.format(
            baiduHotSearch=self._render_items(self.baiduHotSearch()),
            baiduHotTalk=self._render_items(self.baiduHotTalk()),
            weiboHotSearch=self._render_items(self.weiboHotSearch()),
            weiboYW=self._render_items(self.weiboYW()),
            zh_rb=self._render_items(self.zh_rb()))
四、Web请求处理(httpResponse.py)
#coding=utf-8
from GetHotLines import *
#响应HTTP请求
class httpResponse(object):
    """WSGI request dispatcher for the hot-line aggregation service."""

    # Scraper instance created per responder; shared by every request.
    _getHotline = None

    def __init__(self):
        self._getHotline = GetHotLine()

    def handle_login(self, environ, start_response):
        """Placeholder login endpoint: always answers the literal 'login'."""
        start_response('200 OK', [('Content-Type', 'text/html')])
        return [b'login']

    def handle_hotline(self, environ, start_response):
        """Serve the aggregated hot-list overview page as UTF-8 HTML."""
        start_response('200 OK', [('Content-Type', 'text/html')])
        page = self._getHotline.getAll2HTML()
        return [page.encode('utf-8')]

    def deal(self, environ, start_response):
        """Route a WSGI request to its handler; only GET routes are defined."""
        if environ['REQUEST_METHOD'] == 'GET':
            routes = {
                '/login': self.handle_login,
                '/hotline': self.handle_hotline,
            }
            handler = routes.get(environ['PATH_INFO'])
            if handler is not None:
                return handler(environ, start_response)
        # Fallback for every unrouted method/path combination.
        start_response('200 OK', [('Content-Type', 'text/html')])
        return [b'<h1>Hello, Python web!</h1>']
五、Web应用定义(myServer.py)
#coding=utf-8
#创建HTTP请求类
# 从wsgiref模块导入:
from wsgiref.simple_server import make_server
from httpResponse import httpResponse
class myServer(object):
    """Minimal wsgiref HTTP server hosting the httpResponse application."""

    def run(self, port=8080):
        """Start serving and block forever.

        Parameters:
            port: TCP port to bind (default 8080 — the original hard-coded
                  8080 while its comment claimed 8000; now parameterized).
        """
        app = httpResponse()
        # Empty host string binds every local interface.
        httpd = make_server('', port, app.deal)
        print('Serving HTTP on port %d...' % port)
        # Blocks the current thread handling requests until interrupted.
        httpd.serve_forever()
六、Web应用启动(run.py)
# coding=utf-8
# Entry point: build the web-server wrapper and start serving requests.
import myServer

server = myServer.myServer()
server.run()
七、代码结构