前置准备:【需要的软件】:PyCharm,Mysql,Navicat(这个可有可无)
【需要的知识】:会安装Scrapy(注意要安装到PyCharm项目所用解释器对应的环境中),会打开mysql
先建包,PyCharm终端里敲三个命令三次回车
scrapy startproject work
cd work
scrapy genspider bei_bus beijing.8684.cn
打开新建的文件
bei_bus.py代码
from urllib.parse import urljoin
import scrapy
from scrapy import FormRequest, Request
from work.items import WorkItem
class BeiBusSpider(scrapy.Spider):
    """Spider that crawls Beijing bus-line pages on beijing.8684.cn.

    Flow: start_requests() yields the index pages /list1 and /list2,
    parse_index() follows each bus-line link found there, and
    parse_detail() extracts one line's details into a WorkItem.
    """

    name = 'bei_bus'
    allowed_domains = ['beijing.8684.cn']
    # NOTE: kept as a single string (not the conventional list) because the
    # rest of this spider formats and urljoins against it directly.
    start_urls = 'http://beijing.8684.cn/'

    def start_requests(self):
        """Yield GET requests for the first two index pages (list1, list2)."""
        for page in range(2):
            # rstrip('/') avoids the double slash ('...cn//list1') the
            # original format string produced.
            url = '{base}/list{n}'.format(base=self.start_urls.rstrip('/'),
                                          n=page + 1)
            # Plain Request: the original used FormRequest, which adds
            # nothing here since no form data is posted.
            yield Request(url, callback=self.parse_index)

    def parse_index(self, response):
        """Follow every bus-line link on an index page."""
        hrefs = response.xpath("//div[@class='cc-content service-area']/div[@class='list clearfix']/a/@href").extract()
        for href in hrefs:
            yield Request(urljoin(self.start_urls, href),
                          callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract one bus line's fields and yield a populated WorkItem.

        Assigns each field directly instead of the original
        ``bus_item[field] = eval(field)`` loop — eval on field names is
        fragile and an eval-on-strings anti-pattern.
        """
        bus_item = WorkItem()
        bus_item['name'] = response.xpath('//h1[@class="title"]/text()').extract_first()
        bus_item['type'] = response.xpath('//a[@class="category"]/text()').extract_first()
        bus_item['time'] = response.xpath('//ul[@class="bus-desc"]/li[1]/text()').extract_first()
        # trip/luxian use extract(): multiple nodes, so these are lists.
        bus_item['trip'] = response.xpath('//div[@class="trip"]/text()').extract()
        bus_item['luxian'] = response.xpath('//div[@class="bus-lzlist mb15"]/ol/li/a/text()').extract()
        yield bus_item
settings.py代码
# Scrapy settings for the `work` project.
BOT_NAME = 'work'

SPIDER_MODULES = ['work.spiders']
NEWSPIDER_MODULE = 'work.spiders'

# robots.txt is deliberately ignored for this tutorial crawl.
ROBOTSTXT_OBEY = False

# Default headers; the User-Agent mimics a desktop browser so the site
# serves normal pages (take it from your own browser if needed).
DEFAULT_REQUEST_HEADERS = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language':'en',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.47'
}

# Route every scraped item through the MySQL pipeline.
ITEM_PIPELINES = {
    'work.pipelines.WorkPipeline': 300,
}

# MySQL connection settings read by WorkPipeline.
# BUG FIX: the original used C-style `//` comments on the next two lines,
# which is a Python syntax error; Python comments use `#`.
DB_HOST = 'localhost'
DB_USER = 'root'     # your MySQL user name
DB_PWD = '123456'    # your MySQL password
DB_CHARSET = 'UTF8'
DB = 'busdb'
pipelines.py代码
import pymysql
from itemadapter import ItemAdapter
from work import settings
class WorkPipeline:
    """Item pipeline that inserts scraped bus lines into MySQL table businfo2."""

    def __init__(self):
        # Connection parameters come from the project settings module.
        self.host = settings.DB_HOST
        self.user = settings.DB_USER
        self.pwd = settings.DB_PWD
        self.db = settings.DB
        self.charset = settings.DB_CHARSET
        self.connect()

    def connect(self):
        """Open the MySQL connection and create a cursor."""
        self.conn = pymysql.Connect(host=self.host,
                                    user=self.user,
                                    password=self.pwd,
                                    db=self.db,
                                    charset=self.charset)
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # Close the cursor first, then the connection. The original did the
        # reverse, touching a cursor of an already-closed connection.
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        """Insert one item and commit; returns the item unchanged.

        Uses a parameterized query instead of the original `%` string
        interpolation, which was vulnerable to SQL injection and broke on
        values containing quotes.
        """
        sql = ('insert into businfo2(name,type,time,trip,luxian) '
               'values(%s,%s,%s,%s,%s)')
        # str() preserves the original behavior for list-valued fields
        # (trip/luxian), which the old formatting stringified implicitly.
        values = tuple(str(item[k]) for k in ('name', 'type', 'time', 'trip', 'luxian'))
        self.cursor.execute(sql, values)
        self.conn.commit()
        return item
items.py代码
import scrapy
class WorkItem(scrapy.Item):
    """Container for one scraped Beijing bus line.

    Field values are filled in by BeiBusSpider.parse_detail.
    """

    name = scrapy.Field()    # line title (h1.title text)
    type = scrapy.Field()    # category link text (a.category)
    time = scrapy.Field()    # first entry of ul.bus-desc
    trip = scrapy.Field()    # list: div.trip texts
    luxian = scrapy.Field()  # list: stop-link texts
最后建表:在mysql的busdb数据库中创建businfo2表,包含name、type、time、trip、luxian五个文本(varchar/text)字段,与pipelines.py中insert语句的列名一一对应
用Navicat查看,最终效果图:
注意:settings文件有些是在注释基础上改动的,User-Agent从自己浏览器找。