1. Writing a spider in practice:
Structured data is stored in Item objects. Edit items.py:
import scrapy

class MyfirstspjtItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    urlname = scrapy.Field()
    urlkey = scrapy.Field()
    urlcr = scrapy.Field()
    urladdr = scrapy.Field()
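A quick sketch of how this Item behaves (my own usage example, not from the original): instances support dict-style access, and only declared fields may be set.

from myfirstspjt.items import MyfirstspjtItem

item = MyfirstspjtItem(urlname="example title")
item["urlkey"] = "spider"   # dict-style assignment to a declared field
print(item["urlname"])      # 'example title'
print(dict(item))           # {'urlname': 'example title', 'urlkey': 'spider'}
# item["foo"] = 1 would raise KeyError, because 'foo' is not a declared field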
Open the spider .py file you created under the spiders directory and edit it:
# -*- coding: utf-8 -*-
import scrapy
from myfirstspjt.items import MyfirstspjtItem

class WeisuenSpider(scrapy.Spider):
    name = 'weisuen'
    allowed_domains = ['iqianyue.com']
    #start_urls = ['http://iqianyue.com/']
    start_urls = (
        'http://slide.news.sina.com.cn/s/slide_1_2841_103185.html',
        'http://slide.mil.news.sina.com.cn/k/slide_8_193_45192.html#p=1',
        'http://news.sina.com.cn/pl/2016-09-12/doc-ifxvukhv8147404.shtml',
    )
    # Define a new attribute, urls2
    urls2 = (
        "http://www.jd.com",
        "http://sina.com.cn",
        "http://yum.iqianyue.com",
    )
    # Override the start_requests() method
    def start_requests(self):
        # Read the start URLs from the new urls2 attribute instead
        for url in self.urls2:
            # Build each request with the default make_requests_from_url(url)
            # method and yield it
            yield self.make_requests_from_url(url)

    def parse(self, response):
        item = MyfirstspjtItem()
        item["urlname"] = response.xpath("/html/head/title/text()")
        print(item["urlname"])
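Note: make_requests_from_url() was deprecated in later Scrapy releases and eventually removed. If you are on a recent version, an equivalent start_requests() (my adaptation, not from the original) builds the Request directly:

    def start_requests(self):
        for url in self.urls2:
            # dont_filter=True mirrors the behavior of make_requests_from_url()
            yield scrapy.Request(url, dont_filter=True)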
2. XPath basics
Extract the content of an <h2></h2> tag: /html/body/h2
Get the text inside that tag with text(): /html/body/h2/text()
Extract all <p> tags: //p
Extract all <img> tags whose class attribute is "f1": //img[@class="f1"]
These expressions can be tried directly on a snippet, as sketched below.
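A minimal sketch of these expressions with Scrapy's standalone Selector (the HTML snippet is my own toy example):

from scrapy import Selector

html = """
<html><head><title>demo</title></head>
<body>
  <h2>A heading</h2>
  <p>first paragraph</p><p>second paragraph</p>
  <img class="f1" src="a.jpg"/><img class="f2" src="b.jpg"/>
</body></html>
"""
sel = Selector(text=html)
print(sel.xpath("/html/body/h2").extract())           # the whole <h2> element
print(sel.xpath("/html/body/h2/text()").extract())    # ['A heading']
print(sel.xpath("//p/text()").extract())              # text of every <p> tag
print(sel.xpath('//img[@class="f1"]/@src').extract()) # ['a.jpg']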
3. Passing parameters to a spider
# -*- coding: utf-8 -*-
import scrapy
from myfirstspjt.items import MyfirstspjtItem

class WeisuenSpider(scrapy.Spider):
    name = 'weisuen'
    # start_urls is still defined here, but it no longer takes effect,
    # because the constructor below overrides it
    start_urls = (
        'http://slide.news.sina.com.cn/s/slide_1_2841_103185.html',
        'http://slide.mil.news.sina.com.cn/k/slide_8_193_45192.html#p=1',
        'http://news.sina.com.cn/pl/2016-09-12/doc-ifxvukhv8147404.shtml',
    )
    # Override __init__() and add a myurl parameter
    def __init__(self, myurl=None, *args, **kwargs):
        super(WeisuenSpider, self).__init__(*args, **kwargs)
        # Print the URL to crawl, i.e. the received parameter value
        print("URL to crawl: %s" % myurl)
        # Redefine start_urls with the value passed in
        self.start_urls = ["%s" % myurl]

    def parse(self, response):
        item = MyfirstspjtItem()
        item["urlname"] = response.xpath("/html/head/title/text()")
        print("The title of the crawled URL is shown below")
        print(item["urlname"])
At the command prompt (spider arguments are passed with -a and always arrive as strings):
scrapy crawl weisuen -a myurl=http://www.sina.com.cn --nolog
scrapy crawl weisuen -a myurl=http://www.csdn.net --nolog
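The -a option simply forwards keyword arguments to the spider's __init__(), so the same run can be reproduced from a script. A minimal sketch, assuming the spider module was generated at myfirstspjt/spiders/weisuen.py:

from scrapy.crawler import CrawlerProcess
from myfirstspjt.spiders.weisuen import WeisuenSpider

process = CrawlerProcess()
# keyword arguments here reach WeisuenSpider.__init__, just like -a does
process.crawl(WeisuenSpider, myurl="http://www.sina.com.cn")
process.start()  # blocks until the crawl finishes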
To crawl several sites, pass multiple URLs to the spider in a single parameter:
# -*- coding: utf-8 -*-
import scrapy
from myfirstspjt.items import MyfirstspjtItem

class WeisuenSpider(scrapy.Spider):
    name = 'weisuen'
    # allowed_domains = ['iqianyue.com']
    #start_urls = ['http://iqianyue.com/']
    # start_urls is still defined here, but it no longer takes effect,
    # because the constructor below overrides it
    start_urls = (
        'http://slide.news.sina.com.cn/s/slide_1_2841_103185.html',
        'http://slide.mil.news.sina.com.cn/k/slide_8_193_45192.html#p=1',
        'http://news.sina.com.cn/pl/2016-09-12/doc-ifxvukhv8147404.shtml',
    )
    # Override __init__() and add a myurl parameter
    def __init__(self, myurl=None, *args, **kwargs):
        super(WeisuenSpider, self).__init__(*args, **kwargs)
        # split() breaks the received parameter on the "|" separator,
        # producing a list that is assigned to myurllist
        myurllist = myurl.split("|")
        # Loop over myurllist and print each site to be crawled
        for i in myurllist:
            print("URL to crawl: %s" % i)
        # Redefine start_urls with the list of URLs passed in
        self.start_urls = myurllist

    # parse() is unchanged from the previous version
    def parse(self, response):
        item = MyfirstspjtItem()
        item["urlname"] = response.xpath("/html/head/title/text()")
        print("The title of the crawled URL is shown below")
        print(item["urlname"])
Output:
........................>scrapy crawl weisuen -a myurl="http://www.csdn.net|http://yum.iqianyue.com" --nolog
URL to crawl: http://www.csdn.net
URL to crawl: http://yum.iqianyue.com
The title of the crawled URL is shown below
[<Selector xpath='/html/head/title/text()' data='CSDN首页-不止于代码'>]
The title of the crawled URL is shown below
[<Selector xpath='/html/head/title/text()' data='韬云科技|国内首家企业专属云平台'>]
4. Using XMLFeedSpider to parse an XML source
RSS is a web-feed format for aggregating and syndicating frequently updated information.
Create the myxml project:
cmd: scrapy startproject myxml
Edit items.py as follows:
import scrapy

class MyxmlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Stores the article title
    title = scrapy.Field()
    # Stores the corresponding link
    link = scrapy.Field()
    # Stores the corresponding article author
    author = scrapy.Field()
List the spider templates currently available:
.................\myxml>scrapy genspider -l
Available templates:
basic
crawl
csvfeed
xmlfeed
To parse an XML source, pick the xmlfeed spider template, then create the myxmlspider spider with the domain sina.com.cn:
................\myxml>scrapy genspider -t xmlfeed myxmlspider sina.com.cn
Created spider 'myxmlspider' using template 'xmlfeed' in module:
myxml.spiders.myxmlspider
Find myxmlspider.py under the spiders directory. The generated template looks like this; XMLFeedSpider calls parse_node() once for every node matching itertag, and iterator picks the iteration strategy ('iternodes' is a fast regex-based iterator, while 'xml' and 'html' load the document into full selectors):
# -*- coding: utf-8 -*-
from scrapy.spiders import XMLFeedSpider
from myxml.items import MyxmlItem

class MyxmlspiderSpider(XMLFeedSpider):
    name = 'myxmlspider'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://sina.com.cn/feed.xml']
    iterator = 'iternodes' # you can change this; see the docs
    itertag = 'item' # change it accordingly

    def parse_node(self, response, selector):
        i = MyxmlItem()
        #i['url'] = selector.select('url').extract()
        #i['name'] = selector.select('name').extract()
        #i['description'] = selector.select('description').extract()
        return i
Extract the content with XPath expressions:
Article title: "/rss/channel/item/title/text()"
Article link: "/rss/channel/item/link/text()"
Article author: "/rss/channel/item/author/text()"
These paths follow the standard RSS layout, as sketched below.
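A minimal sketch of the RSS shape those paths walk (a toy feed of my own, parsed with Scrapy's standalone Selector):

from scrapy import Selector

rss = """
<rss version="2.0">
  <channel>
    <item><title>Post 1</title><link>http://example.com/1</link><author>wei</author></item>
    <item><title>Post 2</title><link>http://example.com/2</link><author>wei</author></item>
  </channel>
</rss>
"""
sel = Selector(text=rss, type="xml")
print(sel.xpath("/rss/channel/item/title/text()").extract())  # ['Post 1', 'Post 2']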
The modified myxmlspider.py:
# -*- coding: utf-8 -*-
from scrapy.spiders import XMLFeedSpider
from myxml.items import MyxmlItem

class MyxmlspiderSpider(XMLFeedSpider):
    name = 'myxmlspider'
    allowed_domains = ['sina.com.cn']
    # Set the address of the XML file to analyze
    start_urls = ['http://blog.sina.com.cn/rss/1615888477.xml']
    iterator = 'iternodes' # you can change this; see the docs
    # Start iterating from the first node, rss
    itertag = 'rss' # change it accordingly

    def parse_node(self, response, node):
        i = MyxmlItem()
        # Use XPath expressions to extract the information
        # and store it in the corresponding Item fields
        i['title'] = node.xpath("/rss/channel/item/title/text()").extract()
        i['link'] = node.xpath("/rss/channel/item/link/text()").extract()
        i['author'] = node.xpath("/rss/channel/item/author/text()").extract()
        # Loop over the data stored in the item and print it
        for j in range(len(i['title'])):
            print("Article " + str(j + 1))
            print("Title:")
            print(i['title'][j])
            print("Link:")
            print(i['link'][j])
            print("Author:")
            print(i['author'][j])
            print("----------------------------")
        return i
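Run it from the project root just as before (my own invocation, mirroring the earlier commands):
..................\myxml>scrapy crawl myxmlspider --nolog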
A ready-made XML file: http://yum.iqianyue.com/weisuenbook/pyspd/part12/test.xml
Create a new spider for it:
..................\myxml>scrapy genspider -t xmlfeed person iqianyue.com
Created spider 'person' using template 'xmlfeed' in module:
myxml.spiders.person
Modify the spider code:
# -*- coding: utf-8 -*-
from scrapy.spiders import XMLFeedSpider
from myxml.items import MyxmlItem

class PersonSpider(XMLFeedSpider):
    name = 'person'
    allowed_domains = ['iqianyue.com']
    start_urls = ['http://yum.iqianyue.com/weisuenbook/pyspd/part12/test.xml']
    iterator = 'iternodes' # you can change this; see the docs
    itertag = 'person' # change it accordingly

    def parse_node(self, response, selector):
        i = MyxmlItem()
        # Extract the email text of each person node (reusing the link field)
        i['link'] = selector.xpath('/person/email/text()').extract()
        print(i['link'])
        return i
Output at the command prompt:
................\myxml>scrapy crawl person --nolog
['qiansyy@iqianyue.com', 'ming@iqianyue.com']
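For reference, the XPath /person/email/text() and the two addresses above suggest each node in test.xml is shaped roughly like this (a guess for illustration; the real file is at the URL above):
<person>
  <email>qiansyy@iqianyue.com</email>
  ...
</person>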