目录
一、批量爬虫百度百科数据(存为文本格式)
# ----------------------------------------------------------------------
# Batch-crawl several Baidu Baike (Baidu encyclopedia) articles and save
# each article's section headings and paragraph text as one .txt file.
# ----------------------------------------------------------------------
from gevent import monkey
monkey.patch_all()

import os
import time

import bs4
import requests

# Target encyclopedia pages (percent-encoded Chinese lemma names).
url_新一代信息技术 = 'https://baike.baidu.com/item/%E6%96%B0%E4%B8%80%E4%BB%A3%E4%BF%A1%E6%81%AF%E6%8A%80%E6%9C%AF/1146323?fr=aladdin'
url_新材料 = 'https://baike.baidu.com/item/%E6%96%B0%E6%9D%90%E6%96%99/4898312?fr=aladdin'
url_数字创意产业 = 'https://baike.baidu.com/item/%E6%95%B0%E5%AD%97%E5%88%9B%E6%84%8F%E4%BA%A7%E4%B8%9A'
url_人工智能 = 'https://baike.baidu.com/item/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD/9180'
url_list = [url_新一代信息技术, url_人工智能, url_数字创意产业, url_新材料]

# Browser-like User-Agent so Baidu serves the normal page; hoisted out of
# the loop because it never changes between requests.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}

# os.path.join replaces the fragile, Windows-only '.\wenben\{}' backslash
# literals; create the directory up front so open() cannot fail on it.
OUT_DIR = os.path.join('.', 'wenben')
os.makedirs(OUT_DIR, exist_ok=True)

for url in url_list:
    res = requests.get(url, headers=HEADERS)
    res.encoding = 'utf-8'
    bs = bs4.BeautifulSoup(res.text, 'html.parser')
    # The lemma title lives in <dd class="lemmaWgt-lemmaTitle-title"><h1>.
    title = bs.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').text
    # A grouped CSS selector visits the section headings and paragraphs once,
    # in document order.  This replaces the original scan of EVERY <div> with
    # `div in find_all(...)` membership tests (O(n^2), and unreliable because
    # bs4 Tag equality compares markup content, not identity) and drops the
    # `list` variable that shadowed the builtin.  The unsaved openpyxl
    # workbook the original created here was dead code and is removed.
    with open(os.path.join(OUT_DIR, '{}.txt'.format(title)), 'a', encoding='utf-8') as file:
        for div in bs.select('div.para-title.level-2, div.para'):
            if 'para-title' in (div.get('class') or []):
                # Section heading: strip the "编辑" (edit) link text and newlines.
                file.write(div.text.replace('编辑', '').replace('\n', ''))
            else:
                file.write(div.text)
    time.sleep(2)  # be polite: pause between requests
二、批量爬虫百度百科数据(存为Excel格式)
# ----------------------------------------------------------------------
# Batch-crawl several Baidu Baike articles and save each lemma as its own
# .xlsx workbook with "标题" (title) / "内容" (content) columns.
# ----------------------------------------------------------------------
from gevent import monkey
monkey.patch_all()

import os
import time

import bs4
import openpyxl
import requests

# Target encyclopedia pages (percent-encoded Chinese lemma names).
url_新一代信息技术 = 'https://baike.baidu.com/item/%E6%96%B0%E4%B8%80%E4%BB%A3%E4%BF%A1%E6%81%AF%E6%8A%80%E6%9C%AF/1146323?fr=aladdin'
url_新材料 = 'https://baike.baidu.com/item/%E6%96%B0%E6%9D%90%E6%96%99/4898312?fr=aladdin'
url_数字创意产业 = 'https://baike.baidu.com/item/%E6%95%B0%E5%AD%97%E5%88%9B%E6%84%8F%E4%BA%A7%E4%B8%9A'
url_人工智能 = 'https://baike.baidu.com/item/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD/9180'
url_list = [url_新一代信息技术, url_人工智能, url_数字创意产业, url_新材料]

# Browser-like User-Agent, hoisted out of the loop (it never changes).
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}

# os.path.join replaces the Windows-only '.\pc\{}' backslash literals;
# create the output directory up front so wb.save() cannot fail on it.
OUT_DIR = os.path.join('.', 'pc')
os.makedirs(OUT_DIR, exist_ok=True)

for url in url_list:
    res = requests.get(url, headers=HEADERS)
    res.encoding = 'utf-8'
    bs = bs4.BeautifulSoup(res.text, 'html.parser')
    title = bs.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').text
    # Article abstract; strip the "[1]" citation marker.
    summary = bs.find('div', class_='lemma-summary').text.replace('[1]', '')

    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = '{}'.format(title)  # one sheet per lemma, named after it
    # BUG FIX: the original assigned sheet['A1']/'B1' AFTER appending data,
    # silently overwriting the (title, summary) row.  Write the header first.
    sheet.append(['标题', '内容'])
    sheet.append([title, summary])
    # Grouped CSS selector walks headings and paragraphs once in document
    # order, replacing the original O(n^2) `div in find_all(...)` scans and
    # the `list` name that shadowed the builtin.
    for div in bs.select('div.para-title.level-2, div.para'):
        if 'para-title' in (div.get('class') or []):
            # Section heading: strip the "编辑" (edit) link text.
            sheet.append([title, div.text.replace('编辑', '')])
        else:
            sheet.append([title, div.text])
    wb.save(os.path.join(OUT_DIR, '{}.xlsx'.format(title)))
    wb.close()
    time.sleep(2)  # be polite: pause between requests
爬虫结果
三、协程爬虫百度百科数据
# ----------------------------------------------------------------------
# Coroutine (gevent) crawler: fetch the Baidu Baike articles concurrently,
# saving each lemma as its own .xlsx workbook.
# ----------------------------------------------------------------------
from gevent import monkey
monkey.patch_all()

import os
import time

import bs4
import gevent
import openpyxl
import requests
from gevent.queue import Queue

# Task queue of target pages.  The original slept 5 s after every
# put_nowait() — pure dead time, since no request happens here — removed.
work = Queue()
url_新一代信息技术 = 'https://baike.baidu.com/item/%E6%96%B0%E4%B8%80%E4%BB%A3%E4%BF%A1%E6%81%AF%E6%8A%80%E6%9C%AF/1146323?fr=aladdin'
url_新材料 = 'https://baike.baidu.com/item/%E6%96%B0%E6%9D%90%E6%96%99/4898312?fr=aladdin'
url_数字创意产业 = 'https://baike.baidu.com/item/%E6%95%B0%E5%AD%97%E5%88%9B%E6%84%8F%E4%BA%A7%E4%B8%9A'
url_人工智能 = 'https://baike.baidu.com/item/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD/9180'
for _url in (url_新一代信息技术, url_新材料, url_数字创意产业, url_人工智能):
    work.put_nowait(_url)

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}

OUT_DIR = os.path.join('.', 'pc')  # portable path; was '.\pc\{}' literals


def crawler():
    """Worker greenlet: drain the queue, saving one workbook per lemma.

    Each lemma gets its OWN Workbook.  The original reused a single shared
    sheet without clearing it, so every saved file accumulated all previous
    lemmas' rows; a private workbook per lemma fixes that and keeps workers
    from interleaving writes into one sheet.
    """
    while not work.empty():
        url = work.get_nowait()
        res = requests.get(url, headers=HEADERS)
        res.encoding = 'utf-8'
        bs = bs4.BeautifulSoup(res.text, 'html.parser')
        title = bs.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').text
        # Article abstract; strip the "[1]" citation marker.
        summary = bs.find('div', class_='lemma-summary').text.replace('[1]', '')

        wb = openpyxl.Workbook()
        sheet = wb.active
        sheet.title = '{}'.format(title)
        # BUG FIX: header row must be written BEFORE the data; the original
        # set sheet['A1']/'B1' last and overwrote the (title, summary) row.
        sheet.append(['标题', '内容'])
        sheet.append([title, summary])
        # Grouped CSS selector visits headings and paragraphs once in
        # document order, replacing the O(n^2) `div in find_all(...)` scans.
        for div in bs.select('div.para-title.level-2, div.para'):
            if 'para-title' in (div.get('class') or []):
                sheet.append([title, div.text.replace('编辑', '')])
            else:
                sheet.append([title, div.text])
        wb.save(os.path.join(OUT_DIR, '{}.xlsx'.format(title)))
        wb.close()
        time.sleep(2)  # patched by gevent: yields to the other worker


os.makedirs(OUT_DIR, exist_ok=True)
# Actually run concurrently — the original left the spawn/joinall code
# commented out and just called crawler() sequentially.
tasks_list = [gevent.spawn(crawler) for _ in range(2)]
gevent.joinall(tasks_list)
以上代码可以用来练手,这是我19年写的代码,今天拿出来测试了一下,竟然还可以用,哈哈,这里面有些细节可以根据自己的需求进行优化。