Python + Selenium (scraping Zhihu's 50 hot-list entries and storing the answer information for each entry)

import re
import time
import pymysql
import requests
from bs4 import BeautifulSoup


#  Fetch the hot-list page HTML
def GetContends(url):
    cookie = 'paste your cookie here'
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Cookie': cookie
    }
    html = requests.get(url, headers=headers).text
    return html
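
# Note: the post title mentions Selenium, while GetContends above uses requests.
# If the page needed JavaScript rendering, a hedged sketch of the same fetch via
# Selenium (assuming chromedriver is installed and on PATH) could look like this;
# GetContends_Selenium is an illustrative name, not part of the original script.
def GetContends_Selenium(url):
    from selenium import webdriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')        # run Chrome without a visible window
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        return driver.page_source             # fully rendered HTML
    finally:
        driver.quit()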


def Create_HotTitles_Table():

    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306, database='zhihu_hot_topic')
    cursor = db.cursor()

    sql_create = 'create table if not exists Hot_titles(id int(100) auto_increment, question_id varchar(255), question_time varchar(100), question_rank varchar(100), question_topic varchar(255), question_url varchar(255), question_level varchar(255), primary key (id))'

    cursor.execute(sql_create)

    db.close()

def Insert_HotTitles_Data(question_id, question_time, question_rank, question_topic, question_url, question_level):
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306, database='zhihu_hot_topic')
    cursor = db.cursor()

    sql_insert = "insert into hot_titles (question_id, question_time, question_rank, question_topic, question_url, question_level) values (%s,%s,%s,%s,%s,%s)"
    val = (question_id, question_time, question_rank, question_topic, question_url, question_level)

    try:
        cursor.execute(sql_insert, val)
        db.commit()
    except Exception:
        db.rollback()   # undo the failed insert
    db.close()
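
# A quick sanity check (a sketch, not part of the original script): read back the
# most recent rows to confirm the inserts landed; the connection parameters mirror
# those above and may need adjusting for your MySQL setup.
def Show_Latest_HotTitles(limit=5):
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306, database='zhihu_hot_topic')
    cursor = db.cursor()
    cursor.execute("select question_rank, question_topic, question_level from hot_titles order by id desc limit %s", (limit,))
    for row in cursor.fetchall():
        print(row)
    db.close()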


if __name__ == '__main__':

    url_hot = "https://www.zhihu.com/hot"

    #  50 hot-list entries in total; re-fetch the list every minute and process the entries one by one
    while True:
        soup = BeautifulSoup(GetContends(url_hot), 'html.parser')
        contend = list(soup.find_all('section', 'HotItem'))
        Create_HotTitles_Table()

        for i in range(50):
            try:

                if i <= 3:
                    # the top entries use the highlighted 'HotItem-hot' rank style
                    question_rank = re.findall(r'HotItem-rank HotItem-hot">(.*?)</div>', str(contend[i]))[0]
                else:
                    question_rank = re.findall(r'"HotItem-rank">(.*?)</div>', str(contend[i]))[0]

                question_level = re.findall(r'</svg>(.*?) 万热度<span', str(contend[i]))[0]  # heat value (in units of 10,000)
                question_title = re.findall(r'target="_blank" title="(.*?)"><h2', str(contend[i]))[0]  # entry title
                question_url = re.findall(r'href="(.+?)"', str(contend[i]))[0]  # question URL
                question_id = question_url[31:]   # question id: the part after 'https://www.zhihu.com/question/'
                question_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

                print(" ----------------------------------------------------------------------------------------------------------------------")
                print(" 热榜时间戳:%s" % question_time)
                print(" 热榜排名:%s" % question_rank)
                print(" 热榜ID:%s" % question_id)
                print(" 热榜主题:%s" % question_title)
                print(" 热榜网址:%s" % question_url)
                print(" 热榜热度(万):%s" % question_level)
                print(" ----------------------------------------------------------------------------------------------------------------------")

                Insert_HotTitles_Data(question_id, question_time, question_rank, question_title, question_url, question_level)
            except Exception:
                # skip entries whose markup did not match the patterns above
                continue
        time.sleep(60)
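
# Illustrative alternative (not wired into the loop above, and unreachable past the
# while True): the regex extraction is tied to Zhihu's exact markup, so here is a
# sketch of pulling the same fields out with BeautifulSoup navigation instead. The
# reliance on an <a href> and an <h2> inside each 'HotItem' section follows the
# patterns matched by the regexes above; treat it as an assumption to verify
# against the live page.
def Parse_HotItem(item):
    link = item.find('a')
    question_url = link.get('href', '') if link else ''
    title_tag = item.find('h2')
    question_title = title_tag.get_text(strip=True) if title_tag else ''
    question_id = question_url.rstrip('/').rsplit('/', 1)[-1]   # more robust than a fixed slice
    return question_id, question_title, question_url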
