1、创建项目
scrapy startproject ppd
2、爬取单页，主要用 XPath
spider 里面的源码如下：
from scrapy.spiders import Spider
from scrapy.selector import Selector
from ppd.items import BlackItem
class PpdSpider(Spider):
name = "ppd"
allowed_domains = ["dailianmeng.com"]
start_urls = [
"http://www.dailianmeng.com/p2pblacklist/index.html"
]
def parse(self, response):
sites = response.xpath('//*[@id="yw0"]/table/tbody/tr')
items = []
for site in sites:
item = BlackItem()
item['name'] = site.xpath('td[1]/text()').extract()
item['idcard'] = site.xpath('td[2]/text()').extract()
item['mobile']=site.xpath('td[3]/text()').extract()
item['email']=site.xpath('td[4]/text()').extract()
item['total']=site.xpath('td[5]/text()').extract()
item['bepaid']=site.xpath('td[6]/text()').extract()
item['notPaid']=site.xpath('td[7]/text()').extract()
item['time']=site.xpath('td[8]/text()').extract()
item['loanAmount']=site.xpath('td[9]/text()').extract()
items