爬虫进阶:Scrapy 抓取慕课网

前言

  本文使用 Scrapy 抓取慕课网的免费课程以及实战课程信息,相关环境列举如下:

  • scrapy v1.5.1
  • redis
  • psycopg2 (操作并保存数据到PostgreSQL)

数据表

  完整的爬虫流程大致是这样的:分析页面结构 -> 确定提取信息 -> 设计相应表结构 -> 编写爬虫脚本 -> 数据保存入库。入库既可以选择 MongoDB 这样的文档数据库,也可以选择 MySQL、PostgreSQL 这样的关系型数据库,本文使用 PostgreSQL。废话不多讲,这里暂且跳过页面分析,先给出如下两张数据表设计:

tb_imooc_course

-- ============================================================
-- Table: public.tb_imooc_course
-- Course records collected from imooc (the table comment below
-- labels it the free-course table).
--
-- The table is dropped before being created, so re-running this
-- script discards any existing rows.
--
-- Type names are the canonical spellings of the PostgreSQL
-- aliases serial4 / int4 / float4. Character columns carry no
-- explicit COLLATE clause and therefore use the default collation.
-- ============================================================
DROP TABLE IF EXISTS public.tb_imooc_course;

CREATE TABLE public.tb_imooc_course (
    -- NOTE(review): the column comment calls this the primary key, but no
    -- PRIMARY KEY constraint is declared in this statement; confirm that
    -- it is added elsewhere in the script.
    id                serial,
    course_id         integer       NOT NULL,
    name              varchar(100)  NOT NULL,
    difficult         varchar(30)   NOT NULL,
    student           integer       NOT NULL,
    "desc"            varchar(250)  NOT NULL,  -- quoted: DESC is a reserved word
    label             varchar(50)   NOT NULL,
    image_urls        varchar(250)  NOT NULL,
    detail            varchar(250)  NOT NULL,
    duration          varchar(50)   NOT NULL,

    -- Rating columns are nullable.
    overall_score     real,
    content_score     real,
    concise_score     real,
    logic_score       real,

    -- Detail text and teacher columns are nullable.
    summary           varchar(800),
    teacher_nickname  varchar(30),
    teacher_avatar    varchar(250),
    teacher_job       varchar(30),
    tip               varchar(500),
    can_learn         varchar(500),

    -- No DEFAULT is declared, so every INSERT must supply both timestamps.
    update_time       timestamp(6)  NOT NULL,
    create_time       timestamp(6)  NOT NULL
);

-- Catalog comments. The literals are stored as data and are kept in the
-- original language; the trailing remarks give an English reading.
COMMENT ON TABLE  public.tb_imooc_course                  IS '免费课程表';  -- free course table

COMMENT ON COLUMN public.tb_imooc_course.id               IS '自增主键';  -- auto-increment primary key
COMMENT ON COLUMN public.tb_imooc_course.course_id        IS '课程id';    -- course id
COMMENT ON COLUMN public.tb_imooc_course.name             IS '课程名称';  -- course name
COMMENT ON COLUMN public.tb_imooc_course.difficult        IS '难度级别';  -- difficulty level
COMMENT ON COLUMN public.tb_imooc_course.student          IS '学习人数';  -- number of learners
COMMENT ON COLUMN public.tb_imooc_course."desc"           IS '课程描述';  -- course description
COMMENT ON COLUMN public.tb_imooc_course.label            IS '分类标签';  -- category label
COMMENT ON COLUMN public.tb_imooc_course.image_urls       IS '封面图片';  -- cover image
COMMENT ON COLUMN public.tb_imooc_course.detail           IS '详情地址';  -- detail page address
COMMENT ON COLUMN public.tb_imooc_course.duration         IS '课程时长';  -- course duration
COMMENT ON COLUMN public.tb_imooc_course.overall_score    IS '综合评分';  -- overall rating
COMMENT ON COLUMN public.tb_imooc_course.content_score    IS '内容实用';  -- content usefulness
COMMENT ON COLUMN public.tb_imooc_course.concise_score    IS '简洁易懂';  -- concise and easy to follow
COMMENT ON COLUMN public.tb_imooc_course.logic_score      IS '逻辑清晰';  -- logical clarity
COMMENT ON COLUMN public.tb_imooc_course.summary          IS '课程简介';  -- course introduction
COMMENT ON COLUMN public.tb_imooc_course.teacher_nickname IS '教师昵称';  -- teacher nickname
COMMENT ON COLUMN public.tb_imooc_course.teacher_avatar   IS '教师头像';  -- teacher avatar
COMMENT ON COLUMN public.tb_imooc_course.teacher_job      IS '教师职位';  -- teacher job title
COMMENT ON COLUMN public.tb_imooc_course.tip              IS '课程须知';  -- course notes
COMMENT ON COLUMN public.tb_imooc_course.can_learn        IS '能学什么';  -- what can be learned
COMMENT ON COLUMN public.tb_imooc_course.update_time      IS '更新时间';  -- update time
COMMENT ON COLUMN public.tb_imooc_course.create_time      IS '入库时间';  -- time the row was stored

-- ----------------------------
-- Indexes structure for table tb_imooc_course
-- ----------------------------
CREATE UNIQUE INDEX "uni_cid" ON "public"."tb_imooc_course" USING btree (
  
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值