import scrapy, json, requests
from bs4 import BeautifulSoup
import pandas as pd
# NOTE(review): WasdeItem is not imported anywhere in the visible part of this
# file -- presumably it comes from the project's items module; confirm the
# import exists, otherwise this line raises NameError when the module loads.
# The instance is created once at import time and is not referenced by the
# visible spider code.
item = WasdeItem()
class WasdeSpider(scrapy.Spider):
    """Spider that walks the paginated release listing on usda.library.cornell.edu.

    The start URLs are generated dynamically in ``start_requests`` (one request
    per listing page) instead of being listed in ``start_urls``.

    Class attributes ``listing_url``, ``first_page`` and ``last_page`` control
    which pages are requested; the defaults request pages 1 through 11.
    """

    name = 'WASDE'
    allowed_domains = ['usda.library.cornell.edu']

    # Listing URL template; ``{page}`` is replaced by the page number.
    listing_url = (
        'https://usda.library.cornell.edu/concern/publications/3t945q76s'
        '?locale=en&page={page}#release-items'
    )
    # Inclusive bounds of the listing pages to request.
    first_page = 1
    last_page = 11

    def start_requests(self):
        """Yield one ``scrapy.Request`` per listing page, handled by ``parse``."""
        for page in range(self.first_page, self.last_page + 1):
            yield scrapy.Request(
                url=self.listing_url.format(page=page),
                callback=self.parse,
            )

    def parse(self, response):
        """Read the release date of every download link on one listing page.

        Args:
            response: The response for one of the URLs produced by
                ``start_requests``.
        """
        soup = BeautifulSoup(response.body, 'lxml')
        for row in soup.select('#release-items > tr'):
            for link in row.select('a.file_download'):
                raw_date = link.get('data-release-date')
                if raw_date is None:
                    # Tag.get returns None for a missing attribute; skip the
                    # link instead of failing on None.strip().
                    continue
                # NOTE(review): the value is not used further in the visible
                # code -- presumably the rest of this callback (building and
                # yielding an item) was cut off; confirm against the full file.
                release_date = raw_date.strip()
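# Scrapy 动态生成初始 URL (Scrapy: dynamically generating the initial URLs)
# 最新推荐文章于 2023-01-31 07:47:29 发布 (latest recommended article published 2023-01-31 07:47:29)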