We start by creating a new Scrapy project (all commands below are run in cmd):
cd C:\Users\xyx\desktop
Replace the part after Users with your own username.
Then run:
scrapy startproject financeSpider
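For orientation, startproject generates a standard skeleton; the file names are fixed by Scrapy and only the project name varies:

financeSpider/
    scrapy.cfg
    financeSpider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py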
Next, define the following fields in items.py:
import scrapy

class FinancespiderItem(scrapy.Item):
    # one Field per piece of data we want to collect
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()
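An Item behaves like a dictionary whose keys are limited to the declared fields, which catches typos early. A tiny illustration (the "date" line exists only to show the failure mode):

item = FinancespiderItem()
item["title"] = "demo"          # fine: title is a declared field
# item["date"] = "2024-01-01"   # would raise KeyError: no such field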
Next, in cmd, enter the genspider command to create the spider and set its crawl scope (remember to cd into the project directory first):
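Judging from the spider name and domains used later on, the command is presumably:

scrapy genspider finance eastmoney.com

This creates financeSpider/spiders/finance.py with name = "finance" and the allowed domain prefilled.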
Now remember to un-comment the relevant defaults in settings.py so the file ends up like the snippet below.
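A likely version of the two settings this tutorial depends on, assuming the usual changes (disable robots.txt compliance and register the pipeline written further down):

# settings.py
ROBOTSTXT_OBEY = False  # otherwise Scrapy honors robots.txt and may skip pages
ITEM_PIPELINES = {
    # un-commented so FinancespiderPipeline actually receives the items
    "financeSpider.pipelines.FinancespiderPipeline": 300,
}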
Then we write the spider code in finance.py:
import scrapy
from bs4 import BeautifulSoup
from financeSpider.items import FinancespiderItem

class FinanceSpider(scrapy.Spider):
    name = "finance"
    # use the registrable domain, so article links on other eastmoney
    # subdomains are not dropped by the offsite middleware
    allowed_domains = ["eastmoney.com"]
    start_urls = ["https://finance.eastmoney.com/a/cywjh_1.html"]
    # the list pages follow the pattern .../cywjh_<page>.html
    url_head = "https://finance.eastmoney.com/a/cywjh_"
    url_end = ".html"

    def start_requests(self):
        # request list pages 1 through 4
        for i in range(1, 5):
            url = self.url_head + str(i) + self.url_end
            print("Fetching list page:", url)
            yield scrapy.Request(url=url, callback=self.parse1)

    def parse1(self, response):
        # on a list page, collect each article's title and link
        soup = BeautifulSoup(response.text, "lxml")
        for node in soup.find_all("p", class_="title"):
            item = FinancespiderItem()
            item["title"] = node.a.text.strip()
            item["link"] = node.a["href"]
            # hand the half-filled item to the article-page callback
            yield scrapy.Request(url=item["link"], meta={"item": item}, callback=self.parse2)

    def parse2(self, response):
        # on the article page, extract the body and complete the item
        item = response.meta["item"]
        soup = BeautifulSoup(response.text, "lxml")
        content = soup.find("div", id="ContentBody").text.strip()
        item["content"] = content.replace("\n", "")
        yield item
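A side note on meta={"item": item}: it works, but on Scrapy 1.7 and later the documented way to pass values between callbacks is cb_kwargs. A minimal sketch of the same hand-off, assuming Scrapy >= 1.7:

# in parse1, pass the item as a keyword argument for the next callback
yield scrapy.Request(url=item["link"], cb_kwargs={"item": item}, callback=self.parse2)

def parse2(self, response, item):
    # item now arrives as a regular parameter instead of via response.meta
    soup = BeautifulSoup(response.text, "lxml")
    item["content"] = soup.find("div", id="ContentBody").text.strip().replace("\n", "")
    yield item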
Finally, we still need to edit pipelines.py, which receives every item the spider yields:
class FinancespiderPipeline(object):
    # change this to a path that exists on your own machine
    file_path = "C:/Users/xuyix/Desktop/financeSpider/result.txt"

    def __init__(self):
        # append mode, so repeated runs keep adding to the same file
        self.article = open(self.file_path, "a+", encoding="utf8")

    def process_item(self, item, spider):
        # one record per article: title, link, and body separated by tabs
        output = item["title"] + "\t" + item["link"] + "\t" + item["content"] + "\n\n"
        self.article.write(output)
        return item

    def close_spider(self, spider):
        # flush and close the file once the crawl is finished
        self.article.close()
With all of the above in place, run the crawl from cmd:
scrapy crawl finance
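As a quick alternative or sanity check, Scrapy's built-in feed export can dump the items without any custom pipeline:

scrapy crawl finance -o result.json

This writes every yielded item to result.json in the current directory.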
When the log ends with "Spider closed (finished)" and result.txt appears with the scraped articles, the crawl has succeeded.