This crawler uses BeautifulSoup with CSS selectors. The code is as follows:
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

# Fetch a URL and return the parsed page
def get_url(url):
    wb_data = requests.get(url)
    wb_data.encoding = 'gbk'  # Baidu Zhidao pages are GBK-encoded
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    return soup

# Crawl one page of search results
def get_page(url, data=None):
    soup = get_url(url)
    # Capture the href of every result link (<a class="ti">)
    for a in soup.find_all('a', class_='ti', href=True):
        URL = a['href']
        print("Found the URL:", URL)
        soup_1 = get_url(URL)
        time.sleep(0.2)
        titles = soup_1.select('span.ask-title')
        answers = soup_1.select("div[class='best-text mb-10']")
        # Extract the useful content from the fetched question page;
        # zip stops at the shorter list, so pages without a best answer are skipped
        for title, answer in zip(titles, answers):
            data = [
                title.get_text(),
                # replace() strips characters irrelevant to the topic
                answer.get_text().replace("\n", "").replace("展开全部", "").replace("\u3000\u3000", "")
            ]
            saveFile(data)
            print(data)

# Iterate over result pages; pn advances by 10 per page
def get_more_page(start, end):
    for one in range(start, end, 10):
        get_page(url + str(one))  # url is set in the __main__ block below
        time.sleep(0.1)

# Save one record to a per-keyword CSV file
def saveFile(data):
    path = 'data_1/{}.csv'.format(keyword)  # the data_1 directory must exist first
    file = open(path, 'a', encoding='utf-8')
    file.write(str(data).replace('[', ' ').replace('\'', ' ').replace(']', ' '))
    file.write('\n')
    file.close()
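As an aside, saveFile flattens the row with string replaces, which garbles titles or answers that themselves contain brackets or quotes. A minimal alternative sketch using the standard csv module (not the original code; it assumes the same two-element data list and the module-level keyword):

import csv

def saveFile(data):
    # Append one [title, answer] row; csv handles quoting and embedded commas
    path = 'data_1/{}.csv'.format(keyword)
    with open(path, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(data)

The driver section below then reads each keyword and starts the crawl: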
if __name__ == "__main__":
    with open('keyword.txt', 'r', encoding='UTF-8') as f:
        for line in f.readlines():
            keyword = line.strip()
            print(keyword)
            # Percent-encode the keyword as GBK bytes for the query string.
            # (The original built this %XX form with a repr() trick;
            # urllib.parse.quote is the standard way to get the same result.)
            word = quote(keyword.encode('gbk', 'ignore'))
            print(word)
            # The search URL to crawl; the page offset pn is appended per page
            url = 'https://zhidao.baidu.com/search?word=' + word + '&ie=gbk&site=-1&sites=0&date=0&pn='
            # 8 is the number of pages to crawl and could be made a variable
            get_more_page(0, 8 * 10)
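To see what the GBK percent-encoding step produces, here is a quick interpreter check (the keyword 电脑 is only an illustration):

>>> from urllib.parse import quote
>>> quote('电脑'.encode('gbk'))
'%B5%E7%C4%D4'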
keyword.txt is the text file that stores the search keywords, one per line.
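For illustration, a keyword.txt with two made-up entries (电脑 "computer" and 手机 "mobile phone") would look like:

电脑
手机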