爬虫文件
细节问题参考前两章。
在 settings.py 中配置管道、图片下载参数和 MySQL 数据库连接信息:
# Scrapy project settings for the "houses" crawler.

# --- Project identity and spider discovery ---
BOT_NAME = "houses"
SPIDER_MODULES = ["houses.spiders"]
NEWSPIDER_MODULE = "houses.spiders"

# --- Crawl behaviour ---
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"  # only log messages at WARNING level or above are shown
DOWNLOAD_DELAY = 3

# --- Item pipelines (the number is the pipeline's order value) ---
ITEM_PIPELINES = {
    "houses.pipelines.MysqlPipeline": 100,
    "houses.pipelines.HouseImagePipeline": 200,  # image download pipeline
}

# --- Image pipeline options ---
IMAGES_STORE = "images"  # image storage path (pay attention to this one)
IMAGES_EXPIRES = 90
IMAGES_MIN_HEIGHT = 100
IMAGES_MIN_WIDTH = 100

# --- MySQL connection parameters (custom keys read by the project's pipeline) ---
# NOTE(review): credentials are hard-coded here; fine for a local tutorial,
# but consider loading them from the environment for anything shared.
MYSQL_DB_HOST = "127.0.0.1"
MYSQL_DB_PORT = 3306  # port
MYSQL_DB_NAME = "spier"
MYSQL_DB_USER = "root"
MYSQL_DB_PASSWORD = "123456"
打开 cmd,登录 MySQL 并在数据库中创建数据表:
cmd
C:\Users\admin>mysql -u root -p
mysql> show databases;
mysql> use spier;
mysql> create table HouseInfo(house varchar(255),address varchar(255),price varchar(255),total varchar(255))ENGINE=InnoDB DEFAULT CHARSET=utf8; -- 建表
pipelines.py
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import pymysql
class MysqlPipeline:
def open_spider(self,spider):
# 读取settings.py中的配置项
host = spider.settings.get("MYSQL_DB_HOST")
port = spider.settings.get