网页
$ scrapy startproject mypjt
$ scrapy genspider -t basic xxx sina.com.cn
html格式
class CaoItem(scrapy.Item):
    """Item holding basic page metadata scraped by AbcSpider."""
    # fix: original snippet had no indentation, which is a SyntaxError in Python
    urlname = scrapy.Field()  # page <title> text
    urlkey = scrapy.Field()   # presumably meta keywords — TODO confirm against spider
    urlcr = scrapy.Field()
    urladd = scrapy.Field()
class AbcSpider(scrapy.Spider):
    """Basic spider showing how to pass the target URL on the command line
    via ``-a myurl=...`` (picked up in ``__init__``)."""
    # fix: original snippet had no indentation (SyntaxError)
    name = 'abc'
    # Default start URLs; replaced in __init__ when -a myurl=... is supplied.
    start_urls = [
        'http://python.jobbole.com/',
        'http://blog.csdn.net/',  # fix: original line was missing the closing quote
    ]

    def __init__(self, myurl=None, *args, **kwargs):
        super(AbcSpider, self).__init__(*args, **kwargs)
        print("要爬取的网址为: %s" % myurl)
        self.start_urls = ["%s" % myurl]

    def parse(self, response):
        item = CaoItem()
        item['urlname'] = response.xpath('/html/head/title/text()').extract()
        # fix: original built the item but never returned it, so Scrapy
        # would silently discard the scraped data
        yield item
$ scrapy crawl abc --nolog -a myurl="http://mp3.baidu.com"
要爬取的网址为: http://mp3.baidu.com
百度音乐-听到极致
XMLFeedSpider (NOTE: the spider example pasted below is actually the CSVFeedSpider one again — the XMLFeedSpider spider code itself appears to be missing from these notes; only its MyxmlItem is shown)
class MyxmlItem(scrapy.Item):
    """Item for the XMLFeedSpider example: one entry per feed node."""
    # fix: original snippet had no indentation, which is a SyntaxError in Python
    title = scrapy.Field()
    link = scrapy.Field()
    author = scrapy.Field()
class MycsvspiderSpider(CSVFeedSpider):
    """Spider that parses a delimited feed row by row via parse_row."""
    # fix: original snippet had no indentation (SyntaxError)
    name = 'mycsvspider'
    allowed_domains = ['iqianyue.com']
    # Placeholder URL from the notes — replace with a real document
    # that exposes the columns listed in `headers`.
    start_urls = ['这里地址自行定义,找一个xml文档,有上述字段']
    headers = ['name', 'sex', 'add', 'email']
    delimiter = ','

    def parse_row(self, response, row):
        # Called once per row; `row` is a dict keyed by `headers`.
        i = MycsvItem()
        # NOTE(review): .encode() returns bytes on Python 3, so the prints
        # below show b'...' — this looks like Python 2-era tutorial code.
        i['name'] = row['name'].encode()
        i['sex'] = row['sex'].encode()
        print("名字是:")
        print(i['name'])
        print("性别是:")
        print(i['sex'])
        print('------------')
        return i
CSVFeedSpider
class MycsvItem(scrapy.Item):
    """Item for the CSVFeedSpider example: one entry per CSV row."""
    # fix: original snippet had no indentation, which is a SyntaxError in Python
    name = scrapy.Field()
    sex = scrapy.Field()
class MycsvspiderSpider(CSVFeedSpider):
    """CSVFeedSpider example: parse a comma-delimited CSV document,
    emitting one MycsvItem per data row."""
    # fix: original snippet had no indentation (SyntaxError)
    name = 'mycsvspider'
    allowed_domains = ['iqianyue.com']
    # Placeholder URL from the notes — point at a real comma-separated CSV.
    start_urls = ['自定义一个CSV文档用逗号分割的']
    headers = ['name', 'sex', 'add', 'email']
    delimiter = ','

    def parse_row(self, response, row):
        # Called once per CSV row; `row` is a dict keyed by `headers`.
        i = MycsvItem()
        # NOTE(review): .encode() returns bytes on Python 3, so the prints
        # below show b'...' — this looks like Python 2-era tutorial code.
        i['name'] = row['name'].encode()
        i['sex'] = row['sex'].encode()
        print("名字是:")
        print(i['name'])
        print("性别是:")
        print(i['sex'])
        print('------------')
        return i
$ scrapy crawl mycsvspider --nolog