Python3 Crawler in Practice: Scraping NetEase Tech Rolling News

This article shows how to use a Python 3 crawler to scrape NetEase Tech's rolling news: collecting the detail-page URLs, fetching each article's title and body, and saving the data to an Excel file.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/3/13 13:08
# @Author : cunyu
# @Site : cunyu1943.github.io
# @File : NetaseNewsSpider.py
# @Software: PyCharm

import requests
from lxml import etree
import xlwt

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

# Get the list of news detail-page URLs from one list page
def getNewsDetailUrlList(url):
    """
    :param url: URL of one list page
    :return newsDetailList: detail-page URLs contained in that page
    """
    response = requests.get(url, headers=headers)
    # NetEase serves these pages GBK-encoded
    html = response.content.decode('gbk')
    selector = etree.HTML(html)
    newsDetailList = selector.xpath('//ul[@id="news-flow-content"]//li//div[@class="titleBar clearfix"]//h3//a/@href')
    return newsDetailList

# Get the news title
def getNewsTitle(detailUrl):
    """
    :param detailUrl: news detail-page URL
    :return newsTitle: news title, as a list of text nodes
    """
    response = requests.get(detailUrl, headers=headers)
    html = response.content.decode('gbk')
    selector = etree.HTML(html)
    newsTitle = selector.xpath('//div[@class="post_content_main"]//h1/text()')
    return newsTitle

# Get the body text of a news article
def getNewsContent(detailUrl):
    """
    :param detailUrl: news detail-page URL
    :return newsContent: article paragraphs, as a list of text nodes
    """
    response = requests.get(detailUrl, headers=headers)
    html = response.content.decode('gbk')
    selector = etree.HTML(html)
    newsContent = selector.xpath('//div[@class="post_text"]//p/text()')
    return newsContent

# Write the news titles and contents to a file
# TODO: left unimplemented in the original post; the __main__ block below
# writes the data inline instead
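# The sketch below is one possible way to fill in that TODO, built only from
# the helpers above; the name writeNewsToFile is hypothetical, not from the
# original post.
def writeNewsToFile(detailUrlList, path='news.txt'):
    """Write each news title followed by its content to a UTF-8 text file."""
    with open(path, 'w', encoding='utf-8') as f:
        for detailUrl in detailUrlList:
            f.write(''.join(getNewsTitle(detailUrl)) + '\n')
            f.write('\n'.join(getNewsContent(detailUrl)) + '\n\n')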

# Build the list of paginated list-page URLs
def getUrlList(baseUrl, num):
    """
    :param baseUrl: base URL of the rolling-news pages
    :param num: number of pages to crawl
    :return urlList: list of page URLs
    """
    urlList = []
    urlList.append(baseUrl)
    for i in range(2, num + 1):
        # Pages after the first append "_02", "_03", ... to the base URL
        urlList.append(baseUrl + "_" + str(i).zfill(2))
    return urlList
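# A quick worked example of the pagination scheme above:
# getUrlList("http://tech.163.com/special/gd2016", 3) returns
# ['http://tech.163.com/special/gd2016',
#  'http://tech.163.com/special/gd2016_02',
#  'http://tech.163.com/special/gd2016_03']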

if __name__ == '__main__':
    baseUrl = "http://tech.163.com/special/gd2016"
    num = int(input('Number of pages to crawl: '))
    urlList = getUrlList(baseUrl, num)
    print(urlList)

    # Collect the detail-page URLs from every list page
    detailUrl = []
    for url in urlList:
        for i in getNewsDetailUrlList(url):
            detailUrl.append(i)
    print(detailUrl)
    print(getNewsTitle(detailUrl[0]))
    print(getNewsContent(detailUrl[0]))

    # Save the scraped text to a plain-text file
    # with open('news.txt', 'w', encoding='utf-8') as f:
    #     for i in detailUrl:
    #         f.write(''.join(getNewsTitle(i)))
    #         f.write('\n')
    #         f.write(''.join(getNewsContent(i)))
    #         f.write('\n')
    #     print('File written successfully')

    # Save the scraped text to an Excel file
    # Create a workbook
    workbook = xlwt.Workbook(encoding='utf-8')
    news_sheet = workbook.add_sheet('news')
    news_sheet.write(0, 0, 'Title')
    news_sheet.write(0, 1, 'Content')
    for i in range(len(detailUrl)):
        # print(detailUrl[i])
        # xpath() returns a list of text nodes, and xlwt can only write
        # scalar values, so join each list into a single string first
        news_sheet.write(i + 1, 0, ''.join(getNewsTitle(detailUrl[i])))
        news_sheet.write(i + 1, 1, '\n'.join(getNewsContent(detailUrl[i])))
    # Persist the workbook to the named Excel file
    workbook.save('网易新闻.xls')
    print('File written successfully')
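One caveat on the Excel step: xlwt only produces legacy .xls files. If you would rather have a modern .xlsx file, a minimal drop-in sketch using openpyxl (assuming it is installed; openpyxl is not used in the original post) could replace the xlwt section:

from openpyxl import Workbook

workbook = Workbook()
sheet = workbook.active
sheet.title = 'news'
sheet.append(['Title', 'Content'])
for url in detailUrl:
    # Join the text-node lists into single strings, as with xlwt above
    sheet.append([''.join(getNewsTitle(url)), '\n'.join(getNewsContent(url))])
workbook.save('网易新闻.xlsx')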
