通过python爬取CSDN论坛的标题,返回请求网址
使用requests请求网址,lxml中的etree解析数据,time延时,openpyxl将结果保存在excel中
网站共有100页数据,5000个论坛。
我们可以输入爬取的页数:
运行代码:
将数据储存在excel中:
源代码如下:
在这里插入代码片
# -*- coding: utf-8 -*-
# 邮箱:3195841740@qq.com
# 人员:21292
# 日期:2020/3/10 10:42
# 工具:PyCharm
import requests
from lxml import etree
import time
import openpyxl
# Request headers imitating a real Firefox 73 browser session so the CSDN
# forum server treats the scraper as an ordinary visitor.
# NOTE(review): the Cookie below embeds a personal logged-in session
# (UserName / UserToken / session id). It will expire, and it should not be
# committed to version control — confirm whether the listing pages can be
# fetched without authentication at all.
headers = {
'Host': 'bbs.csdn.net',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Cookie': 'uuid_tt_dd=10_18634657010-1580480278567-205683; dc_session_id=10_1580480278567.959197; __gads=ID=442022e467108b24:T=1580480282:S=ALNI_Ma1eS1wB7Jxj3O7hnYAWcbLta-ROg; UserName=DHKSHFJ; UserInfo=00291c5cf64747cc8c74b36657573e33; UserToken=00291c5cf64747cc8c74b36657573e33; UserNick=%E8%BF%81%E5%B0%B10423; AU=EE3; UN=DHKSHFJ; BT=1580538639376; p_uid=U000000; searchHistoryArray=%255B%2522%25E8%25B1%2586%25E7%2593%25A3%25E7%2588%25AC%25E8%2599%25AB%2522%252C%2522%25E7%2588%25AC%25E8%2599%25AB%2522%252C%2522scrapy%25E7%2588%25AC%25E5%258F%2596%25E5%2588%25B0%25E7%259A%2584%25E6%2595%25B0%25E6%258D%25AE%25E5%2586%2599%25E5%2585%25A5%25E6%2596%2587%25E4%25BB%25B6%2522%252C%2522requests%2522%252C%2522tkinter%2522%255D; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1583729124,1583729168,1583738250,1583738329; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=5744*1*DHKSHFJ!6525*1*10_18634657010-1580480278567-205683; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1583762044; c_ref=https%3A//www.baidu.com/link%3Furl%3DYRI-WAKUg0fJa8HulbdWQC291VDpdN-rTAvCbTu45gnxD4WF6iz2JUM7jAHAaRsr-vT8h3ulBiZWga31NFtQrq%26wd%3D%26eqid%3D975e2ab900002c82000000055e65edd2; dc_tos=q6xhxh; announcement=%257B%2522isLogin%2522%253Atrue%252C%2522announcementUrl%2522%253A%2522https%253A%252F%252Fblog.csdn.net%252Fblogdevteam%252Farticle%252Fdetails%252F103603408%2522%252C%2522announcementCount%2522%253A0%252C%2522announcementExpire%2522%253A3600000%257D; _csdn_newbbs_session=BAh7CEkiD3Nlc3Npb25faWQGOgZFRkkiJTRlNjc5YzY2YzQ4ODAwNzZlZDg1YmRhMjc5ZDJiYTY1BjsAVEkiDHVzZXJfaWQGOwBGaQSZ%2FacESSIQX2NzcmZfdG9rZW4GOwBGSSIxSWtITHZFc2hDSmdpbXdzTlU4QytFL2RtWiswNkhrTGd6WjhTVUpXN01NWT0GOwBG--44720c2c33cce034ee7b1f2d2f3f04b0a5688ff3; TY_SESSION_ID=d958c2e9-b7dd-4cb7-8264-d68d2605cb95',
'Upgrade-Insecure-Requests': '1',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
'TE': 'Trailers'
}
# Module-level accumulator: maps scraped forum name -> absolute forum URL.
# get_name() fills it page by page; save_openpyxl() dumps it to Excel.
blog_detials = {}
# Scrape forum names and their corresponding URLs from one listing page.
def get_name(url):
    """Scrape a single CSDN listing page.

    Extracts every forum title and its relative link, stores them in the
    module-level ``blog_detials`` dict (name -> absolute URL), then persists
    the accumulated data to ``CSDN.xlsx`` via :func:`save_openpyxl`.

    :param url: listing-page URL, e.g.
        ``https://bbs.csdn.net/tech_hot_topics?page=1``
    """
    response = requests.get(url, headers=headers)
    # Decode the raw bytes explicitly as UTF-8 instead of trusting
    # requests' header-based guess.
    html = etree.HTML(response.content.decode('utf-8'))
    # Forum titles
    names = html.xpath('//div[@class = "list_1"]/ul/li/a/text()')
    # Relative links, one per title
    urls = html.xpath('//div[@class = "list_1"]/ul/li/a/@href')
    # zip() pairs each title with its link and stops at the shorter list,
    # guarding against a name/url count mismatch.
    for name, href in zip(names, urls):
        print(name)
        blog_detials[name] = 'https://bbs.csdn.net' + href
    # Save once per page, AFTER the loop: save_openpyxl() rebuilds the whole
    # workbook on every call, so calling it per row was quadratic-cost I/O.
    save_openpyxl(blog_detials)
    # Small delay to be polite to the server
    time.sleep(0.005)
# Persist the scraped data to an Excel workbook.
def save_openpyxl(blog_detials, line=None):
    """Write the scraped {name: url} mapping to ``CSDN.xlsx``.

    Builds a fresh workbook each call: a header row (序号 / 名称 / 请求网址)
    followed by one row per forum, then saves to ``CSDN.xlsx`` in the
    current directory, overwriting any previous file.

    :param blog_detials: dict mapping forum name -> absolute forum URL.
    :param line: ignored legacy parameter. The original signature used the
        mutable-default anti-pattern ``line=[2]`` and immediately reset it,
        so the start row is always 2; kept only for backward compatibility.
    """
    file = openpyxl.Workbook()
    sheet = file.active
    sheet.title = '论坛'
    sheet['A1'] = '序号'
    sheet['B1'] = "名称"
    sheet['C1'] = '请求网址'
    # Data rows start at row 2, directly under the header.
    for row, (name, link) in enumerate(blog_detials.items(), start=2):
        sheet['A' + str(row)] = row - 1  # 1-based serial number
        sheet['B' + str(row)] = name
        sheet['C' + str(row)] = link
    file.save('CSDN.xlsx')
# Kick off the crawl over the requested number of listing pages.
def start(page):
    """Crawl listing pages 1..``page``, scraping each one in order.

    :param page: number of pages to crawl (the forum has 100 in total).
    """
    # NOTE: the original loop reused the name ``page`` for the loop variable,
    # shadowing (and destroying) the argument — renamed for clarity.
    for page_no in range(1, page + 1):
        # URL of the current listing page
        url = 'https://bbs.csdn.net/tech_hot_topics?page=' + str(page_no)
        print('*' * 30 + '正在爬取第' + str(page_no) + '页' + '*' * 30)
        get_name(url)
        # Extra per-page delay (get_name also sleeps) to avoid hammering
        # the server.
        time.sleep(0.005)
# Script entry point: ask the user how many of the forum's 100 pages to
# scrape, then run the crawler.
if __name__ == '__main__':
    page_count = int(input("CSDN共100页,请输入你与要爬取页面个数:"))
    start(page_count)
更多电影爬虫:
豆瓣:https://blog.csdn.net/DHKSHFJ/article/details/104739831
电影天堂:https://blog.csdn.net/DHKSHFJ/article/details/104740106