发现了一位Python大佬的教学网站: https://cuiqingcai.com/category/technique ,于是利用从上面学到的知识,把这个网站“技术杂谈”栏目下的文章爬取了一下...
主要使用的模块:
requests、BeautifulSoup(来自 bs4)、openpyxl.Workbook、time
源码:
from openpyxl import Workbook
from bs4 import BeautifulSoup
import requests
import time
# Start time (seconds since the epoch), taken before any scraping begins;
# down_url() subtracts it from its finish time to print the total elapsed time.
time1 = time.time()
def _article_meta(article):
    """Return the (user, clock, eye, comments) texts of one ``<article>`` tag.

    The values are taken positionally from the first four ``span.muted``
    elements inside the article's ``p.auth-span`` paragraph.  When the
    paragraph or some of the spans are missing, the corresponding entries
    are empty strings, so the result always has exactly four items.
    """
    paragraph = article.find('p', class_="auth-span")
    if paragraph is None:
        spans = []
    else:
        spans = paragraph.find_all('span', class_="muted")
    texts = [span.get_text() for span in spans[:4]]
    # Pad so that every article contributes exactly one value per column;
    # otherwise the parallel lists built by down_url() drift out of step.
    texts.extend([''] * (4 - len(texts)))
    return tuple(texts)


def down_url(pages=range(1, 20), timeout=10):
    """Scrape the article listing pages and collect one entry per article.

    Args:
        pages: Iterable of page numbers to fetch.  Defaults to pages 1-19.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A 6-tuple of equally long lists
        ``(head, head_url, head_user, head_clock, head_eye, head_comments)``
        holding, for each article, the link's title, the link's href and the
        four metadata texts read from the article's ``span.muted`` elements.

    Pages that cannot be fetched, or that contain no ``div.content``
    element, are reported on stdout and skipped instead of aborting the
    whole run.  The elapsed time printed at the end is measured from the
    module-level ``time1`` timestamp.
    """
    head = []
    head_url = []
    head_user = []
    head_clock = []
    head_eye = []
    head_comments = []
    # The request headers are identical for every page, so build them once.
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/47.0.2526.80 Safari/537.36 '
    }
    for i in pages:
        print('正在抓取第' + str(i) + "页.................................")
        url = 'https://cuiqingcai.com/category/technique/python/page/' + str(i)
        try:
            response = requests.get(url, headers=header, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Keep what has been scraped so far; one bad page should not
            # throw away the results of all the others.
            print('第' + str(i) + '页抓取失败,已跳过: ' + str(exc))
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        div = soup.find('div', class_="content")
        if div is None:
            print('第' + str(i) + '页未找到文章列表,已跳过')
            continue
        for article in div.find_all('article'):
            link = article.h2.a if article.h2 is not None else None
            if link is None:
                # Not a regular article entry (no <h2><a> heading link).
                continue
            # Article author / date / views / comments, by span position.
            user, clock, eye, comments = _article_meta(article)
            head.append(link.get('title') or link.get_text(strip=True))
            head_url.append(link.get('href', ''))
            head_user.append(user)
            head_clock.append(clock)
            head_eye.append(eye)
            head_comments.append(comments)
    time2 = time.time()
    print('总共耗时 %s s' % round((time2 - time1), 2))
    return head, head_url, head_user, head_clock, head_eye, head_comments
def workbook(head, head_url, head_user, head_clock, head_eye, head_comments,
             filename='Python网址.xlsx'):
    """Write the scraped columns to an .xlsx spreadsheet.

    Args:
        head: Article titles; its length determines the number of data rows.
        head_url: Article URLs.
        head_user: First metadata text of each article.
        head_clock: Second metadata text of each article.
        head_eye: Third metadata text of each article.
        head_comments: Fourth metadata text of each article.
        filename: Path of the workbook to create.  Defaults to the name the
            script has always used.

    Row 1 holds the column names; row ``i + 2`` holds the values at index
    ``i`` of each list.  A list shorter than ``head`` leaves its cells empty
    for the missing rows rather than raising ``IndexError`` before anything
    has been saved.
    """
    wb = Workbook()
    ws1 = wb.active
    # On a fresh worksheet the first append() fills row 1 (A1..F1).
    ws1.append(['name', 'url', 'user', 'clock', 'eye', 'comments'])
    other_columns = (head_url, head_user, head_clock, head_eye, head_comments)
    for row_index, title in enumerate(head):
        row = [title]
        for column in other_columns:
            # None leaves the cell blank when this column has no such entry.
            row.append(column[row_index] if row_index < len(column) else None)
        ws1.append(row)
    wb.save(filename=filename)
# Script entry: scrape every listing page, then write the collected columns
# to the spreadsheet.  The six lists stay bound as module-level names.
(
    head,
    head_url,
    head_user,
    head_clock,
    head_eye,
    head_comments,
) = down_url()
workbook(
    head=head,
    head_url=head_url,
    head_user=head_user,
    head_clock=head_clock,
    head_eye=head_eye,
    head_comments=head_comments,
)