将 Python 的 Flask、Redis 以及 Selenium 结合起来,制作一个热点爬虫系统。本系统的设计具体包括三部分:基于 Flask 的 Web 开发、Selenium 爬虫以及前端数据可视化。
1.1 Flask Web 开发部分
@app.route("/towebweibo", methods=['POST'])
def to_Web_Weibo():
    """Read the top-10 cached Weibo hot-search entries from Redis and return JSON.

    Returns:
        JSON object keyed "1".."10"; each value is a 3-element list
        [title, link, timestamp], where timestamp is the server time of
        this request ('%Y-%m-%d %H:%M'), not the scrape time.
    """
    now_time = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time()))
    pool = redis.ConnectionPool(host='192.168.20.100', port=6379,
                                decode_responses=True)
    r = redis.Redis(connection_pool=pool)
    # Keys titles1..titles10 / links1..links10 are written by running("A").
    # A value is None if the scraper has not populated that slot yet.
    post_data = {
        str(i): [r.get(f"titles{i}"), r.get(f"links{i}"), now_time]
        for i in range(1, 11)
    }
    return jsonify(post_data)
@app.route("/showweb", methods=['GET', 'POST'])
def show_Web():
    """Serve the front-end page that polls /towebweibo and renders the list."""
    return render_template("activity.html")
前端界面模板存放于 templates 文件夹中。
1.2 Selenium 爬虫部分 + Redis
def running(sign):
    """Scrape the top-10 headlines of one news source and cache them in Redis.

    Args:
        sign: selects the source --
            "A" Weibo hot search     -> keys titles{i}  / links{i}
            "B" Toutiao              -> keys titlesB{i} / linksB{i}
            "C" Tencent News         -> keys titlesC{i} / linksC{i}
            "D" china.com domestic   -> keys titlesD{i} / linksD{i}
            "E" chinanews.com china  -> keys titlesE{i} / linksE{i}
        Unknown signs are silently ignored.

    Side effects: launches a Chrome instance, writes 20 Redis string keys.
    """
    # Selenium 4 removed the find_element_by_xpath shortcut; use By.XPATH.
    from selenium.webdriver.common.by import By

    # Per source: (page url, XPath template with {i}, redis key suffix,
    # attribute holding the title, or None to use the element text).
    sources = {
        "A": ('https://s.weibo.com/top/summary?cate=realtimehot',
              '//*[@id="pl_top_realtimehot"]/table/tbody/tr[{i}]/td[2]/a',
              '', None),
        "B": ('https://www.toutiao.com/',
              '//*[@id="root"]/div/div[5]/div[2]/div[2]/div/div/div/ol/li[{i}]/a',
              'B', 'aria-label'),  # Toutiao keeps the headline in aria-label
        "C": ('https://news.qq.com/',
              '//*[@id="rankWrap"]/ul/li[{i}]/a',
              'C', None),
        "D": ('https://news.china.com/domestic/index.html',
              '//*[@id="js-info-flow"]/div[1]/ul/li[{i}]/h3/a',
              'D', None),
        "E": ('https://www.chinanews.com.cn/china/',
              '/html/body/div[3]/div[16]/div[5]/ul/li[{i}]/a',
              'E', None),
    }
    if sign not in sources:
        return
    url, xpath_tpl, suffix, title_attr = sources[sign]

    pool = redis.ConnectionPool(host='192.168.20.100', port=6379,
                                decode_responses=True)
    r = redis.Redis(connection_pool=pool)

    driver = webdriver.Chrome(options=option)  # Chrome with preset options
    try:
        driver.get(url)
        # Crude wait for JS-rendered content; WebDriverWait on the first
        # locator would be more robust.
        time.sleep(10)
        for i in range(1, 11):
            elem = driver.find_element(By.XPATH, xpath_tpl.format(i=i))
            title = elem.get_attribute(title_attr) if title_attr else elem.text
            link = elem.get_attribute('href')
            r.set(f'titles{suffix}{i}', f'{title}')
            r.set(f'links{suffix}{i}', f'{link}')
    finally:
        # Always release the browser, even if a locator throws, so crashed
        # rounds don't leak Chrome processes.
        driver.quit()
1.3 多线程控制多路数据爬取
def run(sign, changetime):
    """Worker loop: scrape source `sign` forever, one round per `changetime` s.

    Args:
        sign: source selector passed through to running().
        changetime: seconds to wait between scrape rounds.
    """
    while True:
        try:
            running(sign)
        except Exception as exc:
            # A failed round (network hiccup, DOM change) must not kill the
            # worker thread -- report it and retry on the next round.
            # The old bare `except: continue` also swallowed KeyboardInterrupt
            # and retried immediately in a tight loop, hammering the site.
            print(f"scrape {sign!r} failed: {exc}")
        time.sleep(changetime)
@app.route("/start", methods=['POST'])
def main():
    """Launch one scraper thread per source, staggered 10 s apart.

    Returns "success" after the last thread starts (the request blocks for
    ~40 s because of the stagger).

    NOTE(review): hitting this endpoint twice spawns duplicate scraper
    threads; guard with a module-level flag if that matters.
    """
    # (source sign, seconds between that source's scrape rounds)
    schedules = [("A", 17), ("B", 20), ("C", 22), ("D", 27), ("E", 15)]
    for idx, (sign, interval) in enumerate(schedules):
        if idx:
            # Stagger browser launches so five Chromes don't start at once.
            time.sleep(10)
        # daemon=True lets the interpreter exit even though the workers
        # loop forever (the originals were non-daemon and blocked shutdown).
        threading.Thread(target=run, args=(sign, interval), daemon=True).start()
    return "success"
1.4 前端数据可视化
本文技术栈偏向于后端开发以及深度学习图像分割方面,前端只停留在最基础的 Ajax 数据调用上。
这里采用 JS 的 setInterval 做定时轮询;该方法比较简单粗暴,还不够完美(可考虑 WebSocket 或服务器推送)。
// Poll the backend every 25 s and rebuild the hot-topics list (#tabletest).
setInterval(function () {
    $.ajax({
        url: 'http://127.0.0.1:9003/towebweibo',
        type: 'post',
        dataType: 'json',
        success: function (data) {
            // data: {"1": [title, link, timestamp], ..., "10": [...]}
            // (numeric `index` is coerced to the string key by JS).
            var content = '';
            for (let index = 1; index < 11; index++) {
                var item = data[index];
                // Quote the href attribute: unquoted URLs break the markup
                // as soon as a link contains spaces or special characters.
                content += '<li>' +
                    '<p class="fl"><b>' +
                    '<a target="_blank" href="' + item[1] + '">热点热搜--</a>' +
                    index + '</b><br>' + item[0] + '<br>' + '</p>' +
                    '<p class="fr pt17">' + item[2] + '</p>' +
                    '</li>';
            }
            // Clear only after fresh data arrives, so a failed poll keeps
            // the previous list instead of leaving the page blank.
            $('#tabletest').empty().append(content);
        }
    });
}, 25000);
1.5 界面显示