创建一个scrapy项目
scrapy startproject myscrapy
生成一个爬虫
scrapy genspider example example.com
启动爬虫
scrapy crawl example
生成crawlspider
scrapy genspider -t crawl example "example.com"
案例:爬取某招聘网站的数据(以下为示例代码,站点 URL 与 XPath 均为占位符,使用前需替换为真实值)
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
class Spider(scrapy.Spider):
    """Crawl a recruit-listing site with a headless Chrome browser.

    Walks the paginated listing pages, emits one item per entry on each
    page, and follows every entry to its detail page to pick up the duty
    text. NOTE(review): `name`, `allowed_domains`, `start_urls`, and the
    XPaths marked below are placeholders -- fill in the real site values
    before running.
    """

    name = ''
    allowed_domains = ['.com']
    start_urls = ['http://.com/']
    page = 1        # index of the listing page most recently requested
    max_page = 10   # last listing page to request -- TODO confirm real page count

    def __init__(self, *args, **kwargs):
        # Forward spider arguments so scrapy initialises name/start_urls
        # correctly (the original skipped super().__init__ entirely).
        super().__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.browser = webdriver.Chrome(
            executable_path=r'C:\Program Files\Google\Chrome\Application\chromedriver.exe',
            chrome_options=chrome_options)

    def closed(self, reason):
        # scrapy calls this hook when the spider finishes; without it the
        # headless Chrome process started in __init__ leaks.
        self.browser.quit()

    def parse(self, response):
        """Yield one item per recruit entry, then schedule the next page.

        Each entry's title is captured here; the rest of the item is filled
        in by `detail`, which receives the partial item through `meta`.
        """
        for div in response.xpath("//div[@class='recruit-list']"):
            item = {
                # Placeholder XPath -- replace with the site's real title node.
                "title": div.xpath(".//h4/text()").extract_first(),
            }
            # Placeholder XPath -- replace with the entry's real link node.
            href = div.xpath(".//a/@href").extract_first() or ""
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.detail,
                                 meta={"item": item})

        # The original `while self.page <= 0` could never run (page starts
        # at 1). Correct scrapy pagination schedules exactly one follow-up
        # request per response, bounded by max_page.
        if self.page < self.max_page:
            self.page += 1
            next_url = self.start_urls[0] + "?index=" + str(self.page)
            yield scrapy.Request(next_url, callback=self.parse)

    def detail(self, response):
        """Complete the item started in `parse` with the duty text."""
        item = response.meta["item"]
        # extract_first() returns None instead of raising IndexError when
        # the selector matches nothing (the original used extract()[0]).
        item["duty"] = response.xpath(
            "//div[@class='duty-text']//li[@class='explain-item']/text()"
        ).extract_first()
        yield item