1. Create the project
At the Windows cmd prompt, enter:
scrapy startproject tipdmScrapy2 E:\桌面\Python网络爬虫实战_project\tipdmSpider2
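This creates a project named tipdmScrapy2 inside the tipdmSpider2 directory. The generated skeleton should look roughly like the following; items.py, pipelines.py, and settings.py are the files edited in later steps:

tipdmSpider2/
    scrapy.cfg
    tipdmScrapy2/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py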
2. Collect the URLs of all listing pages
Create the spider script file under spiders/.
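One convenient way to create it is Scrapy's genspider command, run from inside the new project directory; the spider name sprider_all matches the crawl command used at the end of this section:

cd E:\桌面\Python网络爬虫实战_project\tipdmSpider2
scrapy genspider sprider_all www.tipdm.com

This generates spiders/sprider_all.py with name, allowed_domains, and start_urls already filled in.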
Find the URL pattern. The listing pages all follow the pattern index_{n}.jhtml, so the only thing we need is the total number of pages, which is the text of the last pagination link. Its XPath is:
//*[@id="t251"]/div[3]/div/a[6]
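This XPath can be verified interactively with scrapy shell before writing the spider (a quick sanity check; the text of that link should be the page count):

scrapy shell http://www.tipdm.com/gsxw/index.jhtml
>>> response.xpath('//*[@id="t251"]/div[3]/div/a[6]/text()').extract()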
In the spider, read that number and build the full list of listing-page URLs:

import scrapy

class SpriderAllSpider(scrapy.Spider):
    name = 'sprider_all'
    allowed_domains = ['www.tipdm.com']
    start_urls = ['http://www.tipdm.com/gsxw/index.jhtml']

    def parse(self, response):
        # The text of the last pagination link is the total page count
        number = int(response.xpath('//*[@id="t251"]/div[3]/div/a[6]/text()').extract()[0])
        # Build every listing-page URL from the index_{n}.jhtml pattern
        url_all = ['http://www.tipdm.com/gsxw/index_{}.jhtml'.format(i) for i in range(1, number + 1)]

url_all is built but not yet used; the next step turns each URL into a request.
3. Follow each listing page to the individual news pages

import scrapy
from scrapy.http import Request
from tipdmScrapy2.items import TipdmscrapyItem

class SpriderAllSpider(scrapy.Spider):
    name = 'sprider_all'
    allowed_domains = ['www.tipdm.com']
    start_urls = ['http://www.tipdm.com/gsxw/index.jhtml']

    def parse(self, response):
        number = int(response.xpath('//*[@id="t251"]/div[3]/div/a[6]/text()').extract()[0])
        url_all = ['http://www.tipdm.com/gsxw/index_{}.jhtml'.format(i) for i in range(1, number + 1)]
        for url in url_all:
            yield Request(url, callback=self.parse_url, dont_filter=True)

    # Second callback: collect the detail-page URL of every headline on a listing page
    def parse_url(self, response):
        urls = response.xpath('//*[@id="t251"]/div/div[3]/h1/a/@href').extract()
        for url_sub in urls:
            yield Request(url_sub, callback=self.parse_text, dont_filter=True)

    # Third callback: extract the fields of a single news page
    def parse_text(self, response):
        response.xpath('/html/body/div[2]/div/div[1]/div[2]/h1/text()').extract()  # news title
        response.xpath('/html/body/div[2]/div/div[1]/div[2]/div/div[1]/span[1]/text()').extract()  # publication time
        response.xpath('/html/body/div[2]/div/div[1]/div[2]/div/div[1]/span[3]/text()').extract()  # view count
        '\n'.join(response.xpath('/html/body/div[2]/div/div[2]/p/text()').extract())  # article body
With the spider written, go back to the items.py script and define the item class.
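A minimal items.py declaring the four fields used below (field names taken from the spider code; scrapy.Field() is the standard declaration):

import scrapy

class TipdmscrapyItem(scrapy.Item):
    title = scrapy.Field()       # news title
    times = scrapy.Field()       # publication time
    view_count = scrapy.Field()  # view count
    text = scrapy.Field()        # article body

With the fields declared, parse_text can be updated to fill and return an item: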
from tipdmScrapy2.items import TipdmscrapyItem

    # Third callback, now filling and returning an item
    def parse_text(self, response):
        item = TipdmscrapyItem()
        item['title'] = response.xpath('/html/body/div[2]/div/div[1]/div[2]/h1/text()').extract()  # news title
        item['times'] = response.xpath('/html/body/div[2]/div/div[1]/div[2]/div/div[1]/span[1]/text()').extract()  # publication time
        item['view_count'] = response.xpath('/html/body/div[2]/div/div[1]/div[2]/div/div[1]/span[3]/text()').extract()  # view count
        item['text'] = '\n'.join(response.xpath('/html/body/div[2]/div/div[2]/p/text()').extract())  # article body
        return item
4. Run the spider and save the data
Open the pipelines.py script and import pandas:

import pandas as pd
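A pipelines.py sketch built around that import (the class name here is illustrative; the to_csv call is the one repeated at the end of this section):

import pandas as pd

class TipdmscrapyPipeline(object):
    def process_item(self, item, spider):
        # Each extract() result is a one-element list, so dict(item)
        # maps directly to a one-row DataFrame
        data = pd.DataFrame(dict(item))
        # Append the row to the CSV file
        data.to_csv("new_all.csv", mode='a+', index=None, encoding="utf-8-sig", header=None)
        return item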
Then open settings.py and enable the pipeline.
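Assuming the pipeline class above, the registration looks like this (300 is an arbitrary priority between 0 and 1000):

ITEM_PIPELINES = {
    'tipdmScrapy2.pipelines.TipdmscrapyPipeline': 300,
}

If the crawl is blocked by the site's robots.txt, setting ROBOTSTXT_OBEY = False may also be needed.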
Finally, run the crawl from the project directory:

scrapy crawl sprider_all

Each item that reaches the pipeline is appended to new_all.csv by the to_csv call:

data.to_csv("new_all.csv", mode='a+', index=None, encoding="utf-8-sig", header=None)