- Install the framework: pip install scrapy
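You can confirm the install worked from the same shell:
scrapy version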
- Create a new Scrapy project in a directory of your choice:
scrapy startproject <project name>
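The command generates the standard Scrapy project skeleton; assuming the project is named Boss (matching the imports used in the code below), the files edited in the following steps live here:
Boss/
    scrapy.cfg              # deploy configuration
    Boss/
        items.py            # item definitions (step "Write the item class")
        pipelines.py        # item pipelines (step "Write the pipeline class")
        settings.py         # project settings
        spiders/            # spider modules go here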
- Generate a spider skeleton for the site you want to crawl:
scrapy genspider <spider name> <domain to crawl>
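For the Lianjia spider built below, that would be:
scrapy genspider zhipin lianjia.com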
- Write the item class
Open the project in PyCharm and edit items.py:
import scrapy

class BossItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()       # one Field per attribute you want to scrape
    salary = scrapy.Field()
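The spider below also extracts an address (its assignment is commented out). If you decide to store it, declare the field here as well, since assigning to an undeclared field on a scrapy.Item raises a KeyError:

    address = scrapy.Field()    # only needed if the spider fills item["address"]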
- Write the spider
# -*- coding: utf-8 -*-
import scrapy
from Boss.items import BossItem

class ZhipinSpider(scrapy.Spider):
    name = 'zhipin'                                  # spider name
    allowed_domains = ['lianjia.com']                # allowed domain
    start_urls = ['https://sh.lianjia.com/zufang/']  # start URL

    def parse(self, response):
        items = []
        posts = response.xpath("//div[@class='content__list--item--main']")
        for each in posts:
            item = BossItem()
            # note the leading "." below: a bare "//p[...]" would search the
            # whole document instead of the current listing node
            item["name"] = each.xpath(".//p[@class='content__list--item--title twoline']/a/text()").extract()[0]
            address = each.xpath("p[@class='content__list--item--des']/a[position()<4]/text()").extract()
            item["salary"] = each.xpath("span[@class='content__list--item-price']/em/text()").extract()[0]
            # item["address"] = address[1] + address[2] + address[3]
            print(item)
            items.append(item)
            # yield item
        return items
        # To test whether the page can be fetched at all:
        # with open("lianjia.html", "w", encoding="utf-8") as file:
        #     file.write(response.text)
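Pipelines receive items as they are produced, so the commented-out yield is the more idiomatic route: make parse a generator instead of building a list. A minimal sketch of the same loop, using the same XPaths as above:

    def parse(self, response):
        for each in response.xpath("//div[@class='content__list--item--main']"):
            item = BossItem()
            item["name"] = each.xpath(".//p[@class='content__list--item--title twoline']/a/text()").extract_first()
            item["salary"] = each.xpath("span[@class='content__list--item-price']/em/text()").extract_first()
            yield item    # handed to the pipeline immediately, no list needed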
- Enable the pipeline by adding the following to settings.py:
ITEM_PIPELINES = {
    'Boss.pipelines.BossPipeline': 300,   # project.module.class; lower values run first (0-1000)
}

# Browser user agent (any common desktop UA string works)
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
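Two other standard Scrapy settings are worth adding while you are in settings.py (optional for this tutorial):

DOWNLOAD_DELAY = 1                  # pause between requests; politer to lianjia.com
FEED_EXPORT_ENCODING = 'utf-8'      # keeps Chinese text readable in exported feeds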
- Write the pipeline class (writes each item to a file)
import json

class BossPipeline(object):
    def __init__(self):
        self.file = open("lianjia.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()
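Scrapy also ships item exporters that handle serialization themselves. A sketch of the same pipeline built on JsonLinesItemExporter (in recent Scrapy versions, passing encoding='utf-8' also disables ASCII escaping, giving the same readable output):

from scrapy.exporters import JsonLinesItemExporter

class BossPipeline(object):
    def open_spider(self, spider):
        self.file = open("lianjia.json", "wb")    # exporters expect a binary file
        self.exporter = JsonLinesItemExporter(self.file, encoding="utf-8")

    def process_item(self, item, spider):
        self.exporter.export_item(item)           # one JSON object per line
        return item

    def close_spider(self, spider):
        self.file.close()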
- Run the spider project
scrapy crawl <spider name>
- Alternatively, write a launcher: create a new .py file inside the project and run it:
from scrapy import cmdline

cmdline.execute("scrapy crawl zhipin".split())
When the run finishes, a JSON file (lianjia.json) appears in the project directory.
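A launcher that stays inside Python (convenient for debugging in PyCharm) can use Scrapy's documented CrawlerProcess API instead of shelling out:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # picks up settings.py
process.crawl('zhipin')                            # same name as in scrapy crawl
process.start()                                    # blocks until the crawl finishes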
- Create the database table and load the data into MySQL (Python)
import json
import traceback

import pymysql

class PyMySQL(object):
    create_table = ('create table lianjia('
                    'id int not null primary key auto_increment,'
                    'name varchar(255) not null,'
                    'salary int,'
                    'address varchar(255)) default charset=utf8')
    select = 'select * from lianjia'    # used by select_data() below

    def __init__(self, host, user, pwd, db):
        self.conn = pymysql.connect(host=host, user=user, password=pwd, database=db)
        self.cursor = self.conn.cursor()

    def create_table_func(self):
        self.cursor.execute(PyMySQL.create_table)
        print('Table created')

    def insert_data(self, sql, args=None):
        try:
            self.cursor.execute(sql, args)   # parameterized; pymysql handles quoting
            self.conn.commit()
        except Exception:
            print(traceback.format_exc())
            self.conn.rollback()

    def select_data(self):
        self.cursor.execute(PyMySQL.select)
        for row in self.cursor.fetchall():
            print('Query result: {}'.format(row))

if __name__ == '__main__':
    my = PyMySQL('localhost', 'root', '123456', 'pytest')
    # my.create_table_func()    # run once to create the table
    with open('../lianjia.json', 'r', encoding='utf-8') as f:
        for line in f:
            print(line)
            temp = json.loads(line)
            name = temp['name'].strip()
            salary = temp['salary']
            address = temp.get('address', '')   # present only if the spider fills item["address"]
            sql = 'insert into lianjia(name,salary,address) values(%s,%s,%s)'
            my.insert_data(sql, (name, salary, address))
Running this script writes the scraped data into the database.
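A quick sanity check after the import, reusing the helpers defined above:

my.select_data()                                    # prints every stored row
my.cursor.execute('select count(*) from lianjia')
print('rows inserted:', my.cursor.fetchone()[0])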