items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class MkwItem(scrapy.Item):
    link = scrapy.Field()   # course image URL
    type = scrapy.Field()   # category labels
    title = scrapy.Field()  # title
    level = scrapy.Field()  # difficulty level
    num = scrapy.Field()    # number of students
    info = scrapy.Field()   # short description
    price = scrapy.Field()  # price
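An MkwItem behaves like a dict, so fields can be set and read by key. A quick sketch (the field values below are made up for illustration):

from mkw.items import MkwItem  # assuming the project package is named mkw

item = MkwItem()
item['title'] = 'Example Course'  # set a declared field
item['price'] = 'Free'
print(item['title'])              # read it back like a dict
print(dict(item))                 # convert to a plain dict if needed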
spiders/**.py
# -*- coding: utf-8 -*-
import scrapy

from .. import items


class MukeSpider(scrapy.Spider):
    name = 'muke'
    allowed_domains = ['imooc.com']
    start_urls = ['http://www.imooc.com/course/list/']

    def parse(self, response):
        courses = response.xpath('//*[@id="main"]/div[2]/div[2]/div[1]/div/div')
        for i in range(len(courses)):
            course = response.xpath('//*[@id="main"]/div[2]/div[2]/div[1]/div/div[{}]'.format(i + 1))
            item = items.MkwItem()  # create a fresh item for every course
            item['link'] = course.xpath('.//a/div[1]/img/@data-original').extract()[0]
            # concatenate all category labels, separated by '+'
            type_l = ''
            for label in course.xpath('.//a/div[1]/div/label/text()').extract():
                type_l = type_l + label + '+'
            item['type'] = type_l
            item['title'] = course.xpath('.//a/div[2]/h3/text()').extract()[0]
            item['level'] = course.xpath('.//a/div[2]/div/div[1]/span[1]/text()').extract()[0]
            item['num'] = course.xpath('.//a/div[2]/div/div[1]/span[2]/text()').extract()[0]
            item['info'] = course.xpath('.//a/div[2]/div/p/text()').extract()[0]
            item['price'] = course.xpath('.//a/div[2]/div/div[2]/div/span/text()').extract()[0]
            yield item
        # '下一页' is the link text of the "next page" button
        url = response.xpath("//*[@id=\"main\"]/div[2]/div[2]/div[2]/a[text()='下一页']/@href").extract()
        if url:
            # build the absolute URL of the next page
            page = 'http://www.imooc.com' + url[0]
            # schedule the next page with the same callback
            yield scrapy.Request(page, callback=self.parse)
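From the project root the spider would normally be started with scrapy crawl muke. An equivalent way to launch it from a Python script, sketched with Scrapy's CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads settings.py
process.crawl('muke')   # spider name as defined in MukeSpider.name
process.start()         # blocks until the crawl finishes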
pipelines.py
# -*- coding: utf-8 -*-
import pymysql


class MkwPipeline(object):
    def __init__(self):
        # the last three arguments are the database user, password and database name
        self.connect = pymysql.connect(host='88.88.88.88', user='8888', passwd='8888',
                                       db='8888')
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # SQL statement
        insert_sql = "insert into mkw(link, type, title, level, num, info, price) VALUES (%s, %s, %s, %s, %s, %s, %s)"
        # run the insert with parameterized values
        self.cursor.execute(insert_sql, (
            item['link'], item['type'], item['title'], item['level'], item['num'], item['info'], item['price']))
        # commit, otherwise the rows are not persisted
        self.connect.commit()
        return item

    def close_spider(self, spider):
        # close the cursor and the connection
        self.cursor.close()
        self.connect.close()
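The pipeline assumes a table named mkw already exists. The original does not show the DDL, so the column types below are guesses; a sketch that creates a matching table with pymysql:

import pymysql

conn = pymysql.connect(host='88.88.88.88', user='8888', passwd='8888', db='8888')
with conn.cursor() as cursor:
    # column types are assumptions; everything scraped is stored as text here
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS mkw (
            id INT AUTO_INCREMENT PRIMARY KEY,
            link VARCHAR(255),
            type VARCHAR(255),
            title VARCHAR(255),
            level VARCHAR(64),
            num VARCHAR(64),
            info TEXT,
            price VARCHAR(64)
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()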
settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 0.1
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'mkw.pipelines.MkwPipeline': 300,
}
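The connection details hard-coded in MkwPipeline could instead live in settings.py and be read through Scrapy's from_crawler hook. A sketch of that variant; the MYSQL_* setting names are invented here, not part of the original project:

# In settings.py (assumed names):
# MYSQL_HOST = '88.88.88.88'
# MYSQL_USER = '8888'
# MYSQL_PASSWD = '8888'
# MYSQL_DB = '8888'

import pymysql


class MkwPipeline(object):
    def __init__(self, host, user, passwd, db):
        self.connect = pymysql.connect(host=host, user=user, passwd=passwd, db=db)
        self.cursor = self.connect.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, which exposes the settings
        s = crawler.settings
        return cls(s.get('MYSQL_HOST'), s.get('MYSQL_USER'),
                   s.get('MYSQL_PASSWD'), s.get('MYSQL_DB'))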