1、scrapy 数据爬取框架
安装Anaconda环境
安装依赖库
构建项目
编写实现代码
2、scrapy-redis 分布式爬取
安装依赖
修改配置文件
多开终端模拟分布式爬取
3、编写Dockerfile
# Base OS
FROM centos:7.6.1810

# Image maintainer (MAINTAINER is deprecated; LABEL is the supported form)
LABEL maintainer="xxx <xxx@xxx.com>"

# Install basic dependencies in a single layer and clean the yum cache
# to keep the image small.
RUN yum install -y initscripts crontabs wget bzip2 \
    && yum clean all

# Download and install Anaconda Python, then remove the installer
# in the same layer so it is not baked into the image.
RUN wget --quiet https://repo.anaconda.com/archive/Anaconda3-2019.03-Linux-x86_64.sh -O ~/anaconda.sh \
    && /bin/bash ~/anaconda.sh -b -p /opt/conda \
    && rm ~/anaconda.sh

# Put conda on PATH for every process (not just interactive bash shells),
# and keep the .bashrc entry for interactive logins.
ENV PATH=/opt/conda/bin:$PATH
RUN echo "export PATH=/opt/conda/bin:\$PATH" >> ~/.bashrc

# Install crawler dependencies (scrapy + pymongo via conda, scrapy-redis via pip)
RUN /opt/conda/bin/conda install -y scrapy pymongo \
    && /opt/conda/bin/pip install scrapy-redis

# Set timezone to Asia/Shanghai
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

# Install the crontab task (absolute destination path; root's crontab file)
COPY crontab /var/spool/cron/root

# Set locale so Python handles UTF-8 correctly
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
4、构建容器(未做版本管理)
#!/bin/bash
#
# Rebuild the spider_car image and (re)start its container.
# Stops/removes any existing container and image first, then builds
# fresh from the Dockerfile in the current directory.
set -euo pipefail

readonly NAME="spider_car"

# Stop the container if it is currently running.
# Use --format + exact-match grep so e.g. "spider_car2" or an image
# column containing the string does not cause a false positive.
if docker ps --format '{{.Names}}' | grep -qx -- "$NAME"; then
  echo "stop $NAME"
  docker stop "$NAME"
fi

# Remove the container if it exists at all (running or exited).
if docker ps -a --format '{{.Names}}' | grep -qx -- "$NAME"; then
  echo "rm $NAME"
  docker rm "$NAME"
fi

# Remove the old image if present (original script logged the wrong
# message here; it should announce the rmi step).
if docker images --format '{{.Repository}}' | grep -qx -- "$NAME"; then
  echo "rmi $NAME"
  docker rmi "$NAME"
fi

docker build -t "$NAME" .

# Bind-mount the spider source directory so the image does not have to
# be rebuilt after every code change.
docker run -dit --privileged --name "$NAME" \
  -v /root/spider/spider_py:/opt/spider_py \
  "$NAME" /usr/sbin/init
5、Jenkins自动化构建