Main scraping with BeautifulSoup (bs)
import urllib.request
from bs4 import BeautifulSoup

# Fetch a page with the given request headers and parse it with lxml.
def getSoup(url, headers):
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = str(response.read(), 'utf-8')
    return BeautifulSoup(html, 'lxml')

# Base headers for the plain category-page requests below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
}
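getSoup raises on the first network hiccup, which kills a long run. A minimal retrying variant, sketched here as an optional drop-in (getSoupSafe, the retry count, and the timeout are my assumptions, not part of the original):

import time
import urllib.error

def getSoupSafe(url, headers, retries=3, timeout=10):
    # Retry transient failures a few times, then give up and return None.
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers=headers)
            response = urllib.request.urlopen(req, timeout=timeout)
            return BeautifulSoup(str(response.read(), 'utf-8'), 'lxml')
        except (urllib.error.URLError, TimeoutError) as e:
            print('request failed (%s), attempt %d/%d' % (e, attempt + 1, retries))
            time.sleep(2)
    return None

Callers that switch to getSoupSafe need to check for a None result before calling .select().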
CNBlogs (博客园)
header_child = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Referer': 'https://www.cnblogs.com/',  # overwritten per article below
    'Connection': 'keep-alive',
    'Cookie': '..',
}
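The Cookie value is elided above ('..'); rather than hardcoding a session cookie, it could be read from the environment. A sketch (the CNBLOG_COOKIE variable name is mine):

import os

# Pull the session cookie from the environment instead of the source file.
header_child['Cookie'] = os.environ.get('CNBLOG_COOKIE', '')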
# Walk the article list on a CNBlogs category page and save each body.
def getData_cnblog(soup):
    for item in soup.select('.post-item-title'):
        title = item.get_text().replace('/', '、')  # '/' is illegal in file names
        src = item['href']
        f = 'data/article/cnblog/' + title.strip() + '.txt'
        header_child['Referer'] = src
        soup_c = getSoup(src, header_child)
        print(f)
        for p in soup_c.select('#cnblogs_post_body p'):
            with open(f, 'a', encoding='utf-8') as file:
                file.write(p.get_text() + '\n')

soup = getSoup('https://www.cnblogs.com/cate/networksecurity/', headers)
getData_cnblog(soup)
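Replacing only '/' still leaves characters that are invalid in file names on Windows (':', '*', '?', '"', '<', '>', '|'). A more thorough helper could look like this sketch (safe_filename is my name, not in the original):

import re

def safe_filename(title):
    # Replace every character that is illegal in a file name.
    return re.sub(r'[\\/:*?"<>|]', '、', title).strip()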
Zhihu (知乎)
header_childzhihu = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Referer': 'https://www.zhihu.com/',
    'Connection': 'keep-alive',
    'Cookie': '...',
}
# Save title and body of every entry currently on the Zhihu feed.
def getData_zhihu(soup):
    for item in soup.select('.ContentItem-title a'):
        title = item.get_text().replace('/', '、')
        f = 'data/article/zhihu/' + title.strip() + '.txt'
        print(f)
        src = 'https:' + item['href']  # feed hrefs come without the scheme
        soup1 = getSoup(src, header_childzhihu)
        for p in soup1.select('.RichText p'):
            with open(f, 'a', encoding='utf-8') as file:
                file.write(p.get_text() + '\n')

import time

# The home feed changes on every request, so poll it repeatedly.
for _ in range(500):
    soup = getSoup('https://www.zhihu.com/', header_childzhihu)
    getData_zhihu(soup)
    time.sleep(1)
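Because the loop fetches the same feed 500 times, articles that reappear get downloaded and appended again. A hedged sketch of skipping already-seen links (seen and new_links are my names, not from the original):

seen = set()  # article URLs already downloaded in this run

def new_links(soup):
    # Yield only links that were not seen on an earlier pass.
    for item in soup.select('.ContentItem-title a'):
        src = 'https:' + item['href']
        if src not in seen:
            seen.add(src)
            yield src, item.get_text()

getData_zhihu could then iterate over new_links(soup) instead of re-selecting the anchors itself.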
Juejin (掘金)
# Juejin category labels (backend, frontend, Android, iOS, AI, dev tools, reading, other).
labels = ['后端', '前端', 'Android', 'iOS', '人工智能', '开发工具', '阅读', '其他']

# Save one Juejin article: first line of the output file is the category
# label, the remaining lines are the body paragraphs.
def getData(soup):
    if not soup.select('.article-title'):
        return  # page missing or article deleted
    title = soup.select('.article-title')[0].get_text().replace('/', '、')
    f = 'data/juejin/' + title.strip() + '.txt'
    # Keep only the tags that match a known category label.
    cate = ''
    for li in soup.select('.tag-title'):
        cateTxt = li.get_text()
        if cateTxt in labels:
            cate = cateTxt
    with open(f, 'w', encoding='utf-8') as file:
        file.write(cate + '\n')
        for p in soup.select('.markdown-body p'):
            file.write(p.get_text() + '\n')
# baseUrl is not defined in the original snippet; Juejin article URLs look
# like https://juejin.cn/post/<id>, so that prefix is assumed here.
baseUrl = 'https://juejin.cn/post/'

with open('data/id.txt', encoding='utf-8') as f:
    for line in f:
        url = baseUrl + line.strip()
        soup = getSoup(url, headers)
        getData(soup)
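Each Juejin file now stores the category label on its first line and the article body after it, which makes it easy to read the corpus back as (label, text) pairs, e.g. for training a classifier. A loader sketch (load_juejin_corpus is my name):

import os

def load_juejin_corpus(root='data/juejin'):
    # First line of each file is the label, the rest is the body.
    samples = []
    for name in os.listdir(root):
        with open(os.path.join(root, name), encoding='utf-8') as fh:
            label = fh.readline().strip()
            body = fh.read()
        samples.append((label, body))
    return samples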