Crawling articles and Q&A from Cnblogs, Zhihu, and Juejin with Python

The scraping is done mainly with BeautifulSoup (bs4). A shared helper fetches a page with custom headers and returns the parsed soup:

import urllib.request
from bs4 import BeautifulSoup

def getSoup(url, headers):
    # Build the request with custom headers, read the response as UTF-8,
    # and return a BeautifulSoup object parsed with lxml
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = str(response.read(), 'utf-8')
    return BeautifulSoup(html, 'lxml')
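
Some calls further down pass a herders dict that is not defined in this excerpt, and every function writes into a data/... directory that must already exist before the first open(). A minimal setup sketch, assuming a plain desktop User-Agent and the output paths used below:

import os

# Assumption: herders is a base headers dict, like the site-specific ones below
herders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Connection': 'keep-alive',
}

# Create the output directories used by the three crawlers below
for d in ('data/article/cnblog', 'data/article/zhihu', 'data/juejin'):
    os.makedirs(d, exist_ok=True)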

Cnblogs

header_child = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Referer':'https://juejin.cn/',  # overwritten with each article URL below
    'Connection':'keep-alive',
    'Cookie':'..'  # Cookie from your logged-in session
}

def getData_cnblog(soup):
    # Each .post-item-title element is an <a> tag linking to a post
    items = soup.select('.post-item-title')
    for item in items:
        title = item.get_text()
        title = title.replace('/', '、')  # '/' is illegal in file names
        src = item['href']
        f = 'data/article/cnblog/' + title.strip() + '.txt'
        header_child['Referer'] = src
        # Fetch each child page
        soup_c = getSoup(src, header_child)
        print(f)
        # Save the text of every paragraph in the post body
        for p in soup_c.select('#cnblogs_post_body p'):
            txt = p.get_text()
            with open(f, 'a', encoding='utf-8') as file:
                file.write(txt + '\n')
         
soup = getSoup('https://www.cnblogs.com/cate/networksecurity/',herders)
getData_cnblog(soup)
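
The post title goes straight into a file name, so only '/' is replaced above; titles containing characters such as ':' or '?' would still fail on Windows. A small helper (not in the original code, sketched as a drop-in replacement for the bare replace call) could cover the rest:

import re

def safe_filename(title):
    # Replace every character that is illegal in Windows/Unix file names
    return re.sub(r'[\\/:*?"<>|]', '、', title).strip()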

Zhihu

# Zhihu articles
header_childzhihu = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Referer':'https://www.zhihu.com/',
    'Connection':'keep-alive',
    'Cookie':'...'  # Cookie from a logged-in Zhihu session
}
def getData_zhihu(soup):
    # Each .ContentItem-title a element on the feed links to an article or answer
    items = soup.select('.ContentItem-title a')
    for item in items:
        title = item.get_text()
        title = title.replace('/', '、')
        f = 'data/article/zhihu/' + title.strip() + '.txt'
        print(f)
        src = 'https:' + item['href']  # the hrefs are protocol-relative
        soup1 = getSoup(src, header_childzhihu)
        # Save every paragraph of the rich-text body
        for item1 in soup1.select('.RichText p'):
            txt = item1.get_text()
            with open(f, 'a', encoding='utf-8') as file:
                file.write(txt + '\n')
import time

# The logged-in home feed typically serves a different batch of recommendations
# on each request, so poll it repeatedly with a short pause between requests
for i in range(0, 500):
    soup = getSoup('https://www.zhihu.com/', header_childzhihu)
    getData_zhihu(soup)
    time.sleep(1)
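
Because the loop above polls the feed 500 times and getData_zhihu opens files in append mode, an article that reappears in the feed gets written again. A hedged variant (getData_zhihu_once is a hypothetical name, not in the original) that skips titles already saved to disk:

import os

def getData_zhihu_once(soup):
    # Same flow as getData_zhihu, but skip articles whose file already exists
    for item in soup.select('.ContentItem-title a'):
        title = item.get_text().replace('/', '、').strip()
        f = 'data/article/zhihu/' + title + '.txt'
        if os.path.exists(f):
            continue  # already saved on an earlier pass through the feed
        src = 'https:' + item['href']
        soup1 = getSoup(src, header_childzhihu)
        with open(f, 'w', encoding='utf-8') as file:
            for p in soup1.select('.RichText p'):
                file.write(p.get_text() + '\n')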

Juejin

# Crawl Juejin articles
labels = ['后端','前端','Android','iOS','人工智能','开发工具','阅读','其他']  # Juejin category names

def getData(soup):
    # Use the article title as the file name
    if soup.select('.article-title'):
        title = soup.select('.article-title')[0].get_text()
        title = title.replace('/', '、')
    else:
        return
    f = 'data/juejin/' + title.strip() + '.txt'
    # Pick out the category tag that matches one of the known labels
    cate = ''
    for li in soup.select('.tag-title'):
        cateTxt = li.get_text()
        if cateTxt in labels:
            cate = cateTxt
    # First line of the file is the category, followed by the article body
    with open(f, 'w', encoding='utf-8') as file:
        file.write(cate + '\n')
        for p in soup.select('.markdown-body p'):
            file.write(p.get_text() + '\n')
# Read article ids from a file. The ids have to be collected by hand on Juejin,
# because the site loads its article lists dynamically via Ajax.
with open('data/id.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        url = baseUrl + line          # baseUrl: Juejin article URL prefix (see sketch below)
        soup = getSoup(url, herders)  # fetch and parse the article page
        getData(soup)
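
baseUrl is not defined in this excerpt; it is presumably the Juejin article URL prefix that each collected id is appended to. A sketch of the assumed setup (the prefix and id format are assumptions, not values from the original):

# Assumption: Juejin article pages live under this prefix, and data/id.txt
# holds one numeric article id per line
baseUrl = 'https://juejin.cn/post/'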