Crawling articles and Q&A from Cnblogs, Zhihu, and Juejin with Python

The scraping is done mainly with BeautifulSoup (bs4). A shared helper fetches a page with custom headers and returns the parsed soup:

import urllib.request
from bs4 import BeautifulSoup

def getSoup(url, headers):
    # Build the request with custom headers, read the response as UTF-8,
    # and return a BeautifulSoup object parsed with lxml
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = str(response.read(), 'utf-8')
    return BeautifulSoup(html, 'lxml')
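
Some calls further down pass a herders dict that is not defined in this excerpt, and every function writes into a data/... directory that must already exist before the first open(). A minimal setup sketch, assuming a plain desktop User-Agent and the output paths used below:

import os

# Assumption: herders is a base headers dict, like the site-specific ones below
herders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Connection': 'keep-alive',
}

# Create the output directories used by the three crawlers below
for d in ('data/article/cnblog', 'data/article/zhihu', 'data/juejin'):
    os.makedirs(d, exist_ok=True)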

Cnblogs

header_child = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Referer':'https://juejin.cn/',  # overwritten with each article URL below
    'Connection':'keep-alive',
    'Cookie':'..'  # Cookie from your logged-in session
}

def getData_cnblog(soup):
    # Each .post-item-title element is an <a> tag linking to a post
    items = soup.select('.post-item-title')
    for item in items:
        title = item.get_text()
        title = title.replace('/', '、')  # '/' is illegal in file names
        src = item['href']
        f = 'data/article/cnblog/' + title.strip() + '.txt'
        header_child['Referer'] = src
        # Fetch each child page
        soup_c = getSoup(src, header_child)
        print(f)
        # Save the text of every paragraph in the post body
        for p in soup_c.select('#cnblogs_post_body p'):
            txt = p.get_text()
            with open(f, 'a', encoding='utf-8') as file:
                file.write(txt + '\n')
         
soup = getSoup('https://www.cnblogs.com/cate/networksecurity/',herders)
getData_cnblog(soup)
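
The post title goes straight into a file name, so only '/' is replaced above; titles containing characters such as ':' or '?' would still fail on Windows. A small helper (not in the original code, sketched as a drop-in replacement for the bare replace call) could cover the rest:

import re

def safe_filename(title):
    # Replace every character that is illegal in Windows/Unix file names
    return re.sub(r'[\\/:*?"<>|]', '、', title).strip()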

Zhihu

# Zhihu articles
header_childzhihu = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Referer':'https://www.zhihu.com/',
    'Connection':'keep-alive',
    'Cookie':'...'  # Cookie from a logged-in Zhihu session
}
def getData_zhihu(soup):
    # Each .ContentItem-title a element on the feed links to an article or answer
    items = soup.select('.ContentItem-title a')
    for item in items:
        title = item.get_text()
        title = title.replace('/', '、')
        f = 'data/article/zhihu/' + title.strip() + '.txt'
        print(f)
        src = 'https:' + item['href']  # the hrefs are protocol-relative
        soup1 = getSoup(src, header_childzhihu)
        # Save every paragraph of the rich-text body
        for item1 in soup1.select('.RichText p'):
            txt = item1.get_text()
            with open(f, 'a', encoding='utf-8') as file:
                file.write(txt + '\n')
import time

# The logged-in home feed typically serves a different batch of recommendations
# on each request, so poll it repeatedly with a short pause between requests
for i in range(0, 500):
    soup = getSoup('https://www.zhihu.com/', header_childzhihu)
    getData_zhihu(soup)
    time.sleep(1)
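
Because the loop above polls the feed 500 times and getData_zhihu opens files in append mode, an article that reappears in the feed gets written again. A hedged variant (getData_zhihu_once is a hypothetical name, not in the original) that skips titles already saved to disk:

import os

def getData_zhihu_once(soup):
    # Same flow as getData_zhihu, but skip articles whose file already exists
    for item in soup.select('.ContentItem-title a'):
        title = item.get_text().replace('/', '、').strip()
        f = 'data/article/zhihu/' + title + '.txt'
        if os.path.exists(f):
            continue  # already saved on an earlier pass through the feed
        src = 'https:' + item['href']
        soup1 = getSoup(src, header_childzhihu)
        with open(f, 'w', encoding='utf-8') as file:
            for p in soup1.select('.RichText p'):
                file.write(p.get_text() + '\n')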

Juejin

# Crawl Juejin articles
labels = ['后端','前端','Android','iOS','人工智能','开发工具','阅读','其他']  # Juejin category names

def getData(soup):
    # Use the article title as the file name
    if soup.select('.article-title'):
        title = soup.select('.article-title')[0].get_text()
        title = title.replace('/', '、')
    else:
        return
    f = 'data/juejin/' + title.strip() + '.txt'
    # Pick out the category tag that matches one of the known labels
    cate = ''
    for li in soup.select('.tag-title'):
        cateTxt = li.get_text()
        if cateTxt in labels:
            cate = cateTxt
    # First line of the file is the category, followed by the article body
    with open(f, 'w', encoding='utf-8') as file:
        file.write(cate + '\n')
        for p in soup.select('.markdown-body p'):
            file.write(p.get_text() + '\n')
# Read article ids from a file. The ids have to be collected by hand on Juejin,
# because the site loads its article lists dynamically via Ajax.
with open('data/id.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        url = baseUrl + line          # baseUrl: Juejin article URL prefix (see sketch below)
        soup = getSoup(url, herders)  # fetch and parse the article page
        getData(soup)
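
baseUrl is not defined in this excerpt; it is presumably the Juejin article URL prefix that each collected id is appended to. A sketch of the assumed setup (the prefix and id format are assumptions, not values from the original):

# Assumption: Juejin article pages live under this prefix, and data/id.txt
# holds one numeric article id per line
baseUrl = 'https://juejin.cn/post/'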