前言
使用 Scrapy 抓取慕课网的免费课程以及实战课程信息,相关环境列举如下:
- scrapy v1.5.1
- redis
- psycopg2 (操作并保存数据到PostgreSQL)
数据表
完整的爬虫流程大致是这样的:分析页面结构 -> 确定提取信息 -> 设计相应表结构 -> 编写爬虫脚本 -> 数据保存入库;入库既可以选择 MongoDB 这样的文档数据库,也可以选择 MySQL、PostgreSQL 这样的关系型数据库(本文使用 PostgreSQL)。废话不多讲,这里暂且跳过页面分析,现给出如下两张数据表设计:
-- ----------------------------
-- Table structure for tb_imooc_course
--
-- Rebuilds the table from scratch: any existing copy is dropped first,
-- so running this script discards previously stored rows.
-- Column meanings below are taken from the COMMENT ON statements that
-- accompany this table.
-- ----------------------------
DROP TABLE IF EXISTS "public"."tb_imooc_course";

CREATE TABLE "public"."tb_imooc_course" (
    -- Identity. "id" is described as the auto-increment primary key in its
    -- column comment; no PRIMARY KEY constraint is declared in this statement.
    "id" serial,
    "course_id" integer NOT NULL,
    -- Mandatory listing data (all NOT NULL).
    "name" varchar(100) COLLATE "pg_catalog"."default" NOT NULL,
    "difficult" varchar(30) COLLATE "pg_catalog"."default" NOT NULL,
    "student" integer NOT NULL,
    -- "desc" is a reserved word and must stay double-quoted.
    "desc" varchar(250) COLLATE "pg_catalog"."default" NOT NULL,
    "label" varchar(50) COLLATE "pg_catalog"."default" NOT NULL,
    "image_urls" varchar(250) COLLATE "pg_catalog"."default" NOT NULL,
    "detail" varchar(250) COLLATE "pg_catalog"."default" NOT NULL,
    "duration" varchar(50) COLLATE "pg_catalog"."default" NOT NULL,
    -- Ratings (nullable, single-precision floats).
    "overall_score" real,
    "content_score" real,
    "concise_score" real,
    "logic_score" real,
    -- Optional descriptive text and teacher information (nullable).
    "summary" varchar(800) COLLATE "pg_catalog"."default",
    "teacher_nickname" varchar(30) COLLATE "pg_catalog"."default",
    "teacher_avatar" varchar(250) COLLATE "pg_catalog"."default",
    "teacher_job" varchar(30) COLLATE "pg_catalog"."default",
    "tip" varchar(500) COLLATE "pg_catalog"."default",
    "can_learn" varchar(500) COLLATE "pg_catalog"."default",
    -- Bookkeeping timestamps; no DEFAULT is defined, so every INSERT must
    -- supply both values explicitly.
    "update_time" timestamp(6) NOT NULL,
    "create_time" timestamp(6) NOT NULL
);
-- Catalog descriptions for tb_imooc_course. The comment strings are stored
-- in the database as written and are therefore kept in their original language.

-- Table-level description.
COMMENT ON TABLE "public"."tb_imooc_course"
    IS '免费课程表';

-- Identity columns.
COMMENT ON COLUMN "public"."tb_imooc_course"."id"
    IS '自增主键';
COMMENT ON COLUMN "public"."tb_imooc_course"."course_id"
    IS '课程id';

-- Mandatory listing columns.
COMMENT ON COLUMN "public"."tb_imooc_course"."name"
    IS '课程名称';
COMMENT ON COLUMN "public"."tb_imooc_course"."difficult"
    IS '难度级别';
COMMENT ON COLUMN "public"."tb_imooc_course"."student"
    IS '学习人数';
COMMENT ON COLUMN "public"."tb_imooc_course"."desc"
    IS '课程描述';
COMMENT ON COLUMN "public"."tb_imooc_course"."label"
    IS '分类标签';
COMMENT ON COLUMN "public"."tb_imooc_course"."image_urls"
    IS '封面图片';
COMMENT ON COLUMN "public"."tb_imooc_course"."detail"
    IS '详情地址';
COMMENT ON COLUMN "public"."tb_imooc_course"."duration"
    IS '课程时长';

-- Rating columns.
COMMENT ON COLUMN "public"."tb_imooc_course"."overall_score"
    IS '综合评分';
COMMENT ON COLUMN "public"."tb_imooc_course"."content_score"
    IS '内容实用';
COMMENT ON COLUMN "public"."tb_imooc_course"."concise_score"
    IS '简洁易懂';
COMMENT ON COLUMN "public"."tb_imooc_course"."logic_score"
    IS '逻辑清晰';

-- Optional descriptive text and teacher columns.
COMMENT ON COLUMN "public"."tb_imooc_course"."summary"
    IS '课程简介';
COMMENT ON COLUMN "public"."tb_imooc_course"."teacher_nickname"
    IS '教师昵称';
COMMENT ON COLUMN "public"."tb_imooc_course"."teacher_avatar"
    IS '教师头像';
COMMENT ON COLUMN "public"."tb_imooc_course"."teacher_job"
    IS '教师职位';
COMMENT ON COLUMN "public"."tb_imooc_course"."tip"
    IS '课程须知';
COMMENT ON COLUMN "public"."tb_imooc_course"."can_learn"
    IS '能学什么';

-- Bookkeeping timestamp columns.
COMMENT ON COLUMN "public"."tb_imooc_course"."update_time"
    IS '更新时间';
COMMENT ON COLUMN "public"."tb_imooc_course"."create_time"
    IS '入库时间';
-- ----------------------------
-- Indexes structure for table tb_imooc_course
-- ----------------------------
CREATE UNIQUE INDEX "uni_cid" ON "public"."tb_imooc_course" USING btree (