目标:不用各个搜索网站去获取互联网热点,通过聚合式来显示当前网络上关注的热点,有利于我们节省时间(我们的时间都是很宝贵的~~)。
实现技术:Python(只使用此技术,大牛可以直接无视)
先来讲一下处理过程:
一、python 插件(不会安装的自行百度)
BeautifulSoup
二、爬虫处理基础类的封装(HttpUtils.py)
# -*- coding: utf-8 -*-
from urllib import request, parse
from urllib.parse import urlparse
from urllib.parse import quote
from urllib.parse import urlencode
import string
import re
import os,sys
from bs4 import BeautifulSoup
#多线程支持
import threading
#导入 time 包
import time
class HttpUtils(object):
    """Thin urllib wrapper that downloads a page and returns a BeautifulSoup object."""

    # Base URL of the site (scheme://netloc, or the page's <base href> if present).
    _baseUrl = ''
    # URL the instance was constructed for.
    _siteUrl = ''

    def __init__(self, siteUrl):
        """Resolve the effective base URL of *siteUrl*.

        Fetches the page once; a <base href="..."> tag, when present,
        overrides the scheme://netloc default. If the fetch fails the
        class-level empty defaults are left untouched.
        """
        parts = urlparse(siteUrl)
        baseUrl = parts.scheme + "://" + parts.netloc
        bsObj = self.get_htmlReturnBS4(siteUrl)
        if bsObj is None:
            # Page could not be fetched; keep the '' defaults.
            return
        base_tags = bsObj.find_all('base', href=True)
        if base_tags:
            baseUrl = base_tags[0].attrs['href']
        self._baseUrl = baseUrl
        self._siteUrl = siteUrl

    def get_htmlReturnBS4(self, url, tryTime=5):
        """Download *url* and parse it into a BeautifulSoup object.

        Returns None for a blank URL or a non-200 response. On any error
        the request is retried up to *tryTime* more times with a 5 s pause
        between attempts; None is returned when the retries are exhausted.
        """
        statusCode = 0
        try:
            if url.strip() == '':
                return None
            # Percent-encode characters outside the printable ASCII set.
            url = quote(url, safe=string.printable)
            # A browser-like User-Agent avoids "HTTP Error 403: Forbidden"
            # from sites that reject the default urllib client string.
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
            req = request.Request(url, headers=headers)
            req.add_header('Referer', url)
            # Context manager closes the connection deterministically
            # (the original never closed the response object).
            with request.urlopen(req) as page:
                statusCode = page.status
                if statusCode != 200:
                    return None
                html_code = page.read()
            try:
                # The scraped sites mostly serve GB2312-encoded pages.
                return BeautifulSoup(html_code, 'html.parser', from_encoding="gb2312")
            except Exception:
                # Last resort: retry the parse assuming plain ASCII.
                return BeautifulSoup(html_code, 'html.parser', from_encoding="ascii")
        except Exception as e:
            # Exception, not BaseException, so Ctrl-C still interrupts retries.
            print('get_htmlReturnBS4(Exception:%s|url:|%s|statusCode:|%s) \t' % (e, url, statusCode))
            if tryTime == 0:
                return None
            time.sleep(5)
            return self.get_htmlReturnBS4(url, tryTime - 1)
三、各个主要信息来源的处理(GetHotLines.py)
# coding:utf-8
from urllib import request, parse
from urllib.parse import unquote
import re
import os,sys
import bs4
from HttpUtils import *
from bs4.element import Comment
from Utils import *
import uuid
import json
class GetHotLine(object):
    """Scrapes the trending boards of Baidu, Tieba, Weibo and Zhihu.

    Every public scraper returns a list of dicts shaped
    {"title": ..., "hotValue": ..., "url": ...}.
    """

    # Shared <li> template used when rendering any board to HTML.
    _ITEM_TEMPLATE = "<li><a style='color: black;font-size:8px;' target='_blank' href = '{url}'>{title} - <span style='color:red;'> {hotValue}</span></a> </li>"

    def baiduHotSearch(self):
        """Baidu front-page hot-search list, parsed from the embedded JSON blob."""
        url = 'https://www.baidu.com'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        # The board data ships inside the element with id="hotsearch_data".
        hotsearch_json = json.loads(html_bs4.find(id='hotsearch_data').text)
        l = []
        for x in hotsearch_json["hotsearch"]:
            # Only the three fields we publish are read; the original also read
            # isViewed/isNew/hotTags (unused, and a KeyError risk if absent).
            l.append({
                "title": x['pure_title'],
                "hotValue": x['heat_score'],
                # linkurl arrives percent-encoded.
                "url": unquote(x['linkurl']),
            })
        return l

    def baiduHotTalk(self):
        """Baidu Tieba hot-topic ("hot talk") list."""
        l = []
        url = 'http://tieba.baidu.com/hottopic/browse/topicList?res_type=1'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        for item in html_bs4.findAll('li', attrs={'class': 'topic-top-item'}):
            text_tags = item.findAll('a', attrs={'class': 'topic-text'})
            num_tags = item.findAll('span', attrs={'class': 'topic-num'})
            l.append({
                "title": text_tags[0].text,
                "hotValue": num_tags[0].text,
                'url': text_tags[0].get('href'),
            })
        return l

    def weiboHotSearch(self):
        """Weibo realtime hot-search board."""
        l = []
        url = 'https://s.weibo.com/top/summary?cate=realtimehot'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        for cell in html_bs4.findAll('td', attrs={'class': 'td-02'}):
            # Guard BEFORE dereferencing .a (the original checked it only
            # after already calling td_02[i].a.text, so the check was dead).
            if cell.a is None:
                continue
            keyword = cell.a.text.strip().replace('\n', '')
            # Pinned entries carry no hit-count span.
            if cell.span is None:
                times = '0'
            else:
                times = cell.span.text.strip().replace('\n', '')
            linkUrl = "https://s.weibo.com/" + cell.a.get('href')
            l.append({"title": keyword, "hotValue": times, 'url': linkUrl})
        return l

    def weiboYW(self):
        """Weibo social-event ("key news") board; the page exposes no heat value."""
        l = []
        url = 'https://s.weibo.com/top/summary?cate=socialevent'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        for cell in html_bs4.findAll('td', attrs={'class': 'td-02'}):
            if cell.a is None:
                continue
            keyword = cell.a.text.strip().replace('\n', '').replace('#', '')
            linkUrl = "https://s.weibo.com/" + cell.a.get('href')
            l.append({"title": keyword, "hotValue": "0", 'url': linkUrl})
        return l

    def zh_rb(self):
        """Zhihu billboard, parsed from the page's js-initialData JSON."""
        l = []
        url = 'https://www.zhihu.com/billboard'
        hu = HttpUtils(url)
        html_bs4 = hu.get_htmlReturnBS4(url)
        hot_list_json = json.loads(html_bs4.find(id='js-initialData').text)
        for x in hot_list_json["initialState"]['topstory']['hotList']:
            target = x['target']
            l.append({
                "title": target['titleArea']['text'],
                "hotValue": target['metricsArea']['text'],
                'url': target['link']['url'],
            })
        return l

    def getAll2List(self):
        """Fetch every board and return them keyed by source name."""
        return {'baiduHotSearch': self.baiduHotSearch(),
                'baiduHotTalk': self.baiduHotTalk(),
                'weiboHotSearch': self.weiboHotSearch(),
                'weiboYW': self.weiboYW(),
                'zh_rb': self.zh_rb()}

    def _render_items(self, items):
        """Render one board's item list into a string of <li> anchor rows."""
        return ''.join(
            self._ITEM_TEMPLATE.format(title=item["title"],
                                       hotValue=item["hotValue"],
                                       url=item["url"])
            for item in items)

    def getAll2HTML(self):
        """Fetch every board and render a single static HTML overview page.

        NOTE: the original template used an invalid <ui> tag (and one
        broken `</ui` missing its `>`); both fixed to proper <ul> tags.
        """
        html = '''<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
<table>
<tr>
<td valign="top" style='background-color:LightSkyBlue;width:350px;'>
一、百度-热搜榜
</td>
<td valign="top" style='background-color:LightSkyBlue;width:350px;'>
二、百度-热议榜
</td>
<td valign="top" style='background-color:LightSkyBlue;width:350px;'>
三、新浪-热搜榜
</td>
<td valign="top" style='background-color:LightSkyBlue;width:350px;'>
四、新浪-要闻榜
</td>
</tr>
<tr>
<td valign="top">
<ul>{baiduHotSearch}</ul>
</td>
<td valign="top">
<ul>{baiduHotTalk}</ul>
</td>
<td valign="top">
<ul>{weiboHotSearch}</ul>
</td>
<td valign="top">
<ul>{weiboYW}</ul>
</td>
</tr>
<tr>
<td colspan='4' valign="top" style='background-color:LightSkyBlue;'>
五、知乎-热榜
</td>
</tr>
<tr>
<td colspan='4' valign="top">
<ul>{zh_rb}</ul>
</td>
</tr>
</table>
</body>
</html>
'''
        # One shared rendering helper replaces the five copy-pasted loops.
        return html.format(
            baiduHotSearch=self._render_items(self.baiduHotSearch()),
            baiduHotTalk=self._render_items(self.baiduHotTalk()),
            weiboHotSearch=self._render_items(self.weiboHotSearch()),
            weiboYW=self._render_items(self.weiboYW()),
            zh_rb=self._render_items(self.zh_rb()))
四、Web请求处理(httpResponse.py)
#coding=utf-8
from GetHotLines import *
#响应HTTP请求
class httpResponse(object):
    """WSGI request dispatcher for the hot-line aggregation service."""

    # Scraper instance created per responder; shared by every request.
    _getHotline = None

    def __init__(self):
        self._getHotline = GetHotLine()

    def handle_login(self, environ, start_response):
        """Placeholder login endpoint: always answers the literal 'login'."""
        start_response('200 OK', [('Content-Type', 'text/html')])
        return [b'login']

    def handle_hotline(self, environ, start_response):
        """Serve the aggregated hot-list overview page as UTF-8 HTML."""
        start_response('200 OK', [('Content-Type', 'text/html')])
        page = self._getHotline.getAll2HTML()
        return [page.encode('utf-8')]

    def deal(self, environ, start_response):
        """Route a WSGI request to its handler; only GET routes are defined."""
        if environ['REQUEST_METHOD'] == 'GET':
            routes = {
                '/login': self.handle_login,
                '/hotline': self.handle_hotline,
            }
            handler = routes.get(environ['PATH_INFO'])
            if handler is not None:
                return handler(environ, start_response)
        # Fallback for every unrouted method/path combination.
        start_response('200 OK', [('Content-Type', 'text/html')])
        return [b'<h1>Hello, Python web!</h1>']
五、Web应用定义(myServer.py)
#coding=utf-8
#创建HTTP请求类
# 从wsgiref模块导入:
from wsgiref.simple_server import make_server
from httpResponse import httpResponse
class myServer(object):
    """Minimal wsgiref HTTP server hosting the httpResponse application."""

    def run(self, port=8080):
        """Start serving and block forever.

        Parameters:
            port: TCP port to bind (default 8080 — the original hard-coded
                  8080 while its comment claimed 8000; now parameterized).
        """
        app = httpResponse()
        # Empty host string binds every local interface.
        httpd = make_server('', port, app.deal)
        print('Serving HTTP on port %d...' % port)
        # Blocks the current thread handling requests until interrupted.
        httpd.serve_forever()
六、Web应用启动(run.py)
# coding=utf-8
# Entry point: build the web-server wrapper and start serving requests.
import myServer

server = myServer.myServer()
server.run()
七、代码结构