1.首先新建一个项目,命令如下所示;
scrapy startproject test1
cd test1
接下来
scrapy genspider sports www.sports.qq.com
如下图:
sports.py:
import scrapy
from scrapy import Spider, Request
from urllib.parse import urlencode
import json
from test1.items import sportsItem
class sportsSpider(Spider):
    """Spider that queries Tencent Sports' NBA team-info API and yields one
    populated sportsItem per team."""
    name = 'sports'
    # Requests actually target matchweb.sports.qq.com. The original value
    # 'www.sports.qq.com' did NOT cover that host, so OffsiteMiddleware
    # silently filtered every request; a parent domain allows all subdomains.
    allowed_domains = ['sports.qq.com']
    # JSON API endpoint returning a team's basic profile.
    base_url = 'https://matchweb.sports.qq.com/team/baseInfo?'

    def start_requests(self):
        """Issue one GET per teamId in 1..30 for competition 100000 (NBA)."""
        params = {
            'competitionId': '100000',
        }
        # Reuse the class-level base_url instead of re-declaring it locally,
        # as the original did (the local shadowed the class attribute).
        for team_id in range(1, 31):
            params['teamId'] = team_id
            yield Request(self.base_url + urlencode(params), self.parse)

    def parse(self, response):
        """Parse the JSON body and yield a sportsItem with the team's fields.

        Raises KeyError if the API payload lacks data.baseInfo or any of the
        expected fields.
        """
        result = json.loads(response.text)
        info = result['data']['baseInfo']
        item = sportsItem()
        # Copy the six fields declared on sportsItem straight from the payload.
        for field in ('coach', 'joinNBADate', 'brief', 'venue', 'city', 'cnName'):
            item[field] = info[field]
        yield item
items.py:
from scrapy import Item, Field


class sportsItem(Item):
    """Container for one NBA team's basic profile scraped from Tencent Sports."""

    brief = Field()        # short team description
    city = Field()         # home city
    cnName = Field()       # Chinese team name
    coach = Field()        # current head coach
    joinNBADate = Field()  # date the team joined the NBA
    venue = Field()        # home arena
pipelines.py:
import pymongo
class MongoPipeline:
    """Item pipeline that persists every scraped item into a MongoDB collection."""

    def __init__(self, mongo_url, mongo_db):
        # Connection parameters; injected by from_crawler from the settings.
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from project settings MONGO_URL / MONGO_DATABASE."""
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
        )

    def open_spider(self, spider):
        """Open the MongoDB connection when the spider starts."""
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]
        self.post = self.db['kk']

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes.

        The original class defined close_spider twice; the duplicate
        definition (which shadowed this one) has been removed.
        """
        self.client.close()

    def process_item(self, item, spider):
        """Insert the item into the collection and return it.

        The original returned None, which would hand None to any later
        pipeline component; Scrapy pipelines must return the item.
        """
        self.post.insert_one(dict(item))
        return item
最后是一些配置:(settings.py)
# Scrapy project settings for the test1 crawler.
BOT_NAME = 'test1'

SPIDER_MODULES = ['test1.spiders']
NEWSPIDER_MODULE = 'test1.spiders'

# Deliberately ignore robots.txt for this internal API endpoint.
ROBOTSTXT_OBEY = False

# Default headers sent with every request; Host/Referer mimic a browser
# visiting the Tencent Sports site.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/105.0.0.0 Safari/537.36'
    ),
    'Host': 'matchweb.sports.qq.com',
    'Referer': 'https://sports.qq.com/',
}

# Route every scraped item through the MongoDB pipeline.
ITEM_PIPELINES = {
    'test1.pipelines.MongoPipeline': 300,
}

# Connection parameters consumed by MongoPipeline.from_crawler.
MONGO_URL = 'localhost'
MONGO_DATABASE = 'nba'
最后运行scrapy crawl sports
数据如下图: