The first crawler course I ever took was 小甲鱼's Scrapy tutorial, but at the time I hadn't even figured out how pip install worked, so I came away completely confused. Later I discovered you could scrape with requests plus Beautiful Soup instead, and compared with Scrapy that approach is almost embarrassingly simple (there's a minimal sketch below). Then I kept hearing that Scrapy does have real advantages over plain requests, so I circled back to learn Scrapy properly.
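For reference, a bare-bones requests + Beautiful Soup scrape looks something like this (the URL and the tag I pull out are just placeholders, not part of the Lianjia project below):

import requests
from bs4 import BeautifulSoup

# placeholder page -- swap in whatever you actually want to scrape
resp = requests.get('https://example.com',
                    headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'html.parser')
# quick demo: print the text of every link on the page
for a in soup.find_all('a'):
    print(a.get_text(strip=True))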
The first hurdle was installation: getting Scrapy in place took me half a day, because Scrapy needs a pile of dependency packages installed first, and pip keeps throwing errors until they are all there.
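On Windows the dependencies are usually something like the following (the exact list varies with your Python version; lxml and Twisted are the usual troublemakers, and you may need to install their .whl files by hand):

pip install wheel
pip install lxml
pip install Twisted
pip install pywin32
pip install scrapy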
With Scrapy installed, press Win + R, type cmd to open a command prompt, and run:
scrapy startproject test1
cd test1
scrapy genspider vihu "www.zhihu.com"
I originally wanted to scrape Zhihu, but I never figured out the login, so in the end I went after Lianjia instead.

On to the code.
items.py
import scrapy


class Test4Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()         # listing title
    address = scrapy.Field()       # plain text inside the houseInfo block
    houseInfo = scrapy.Field()     # linked text inside the houseInfo block
    positionIcon = scrapy.Field()  # floor / position text
    blank = scrapy.Field()         # linked district name in the position block
    starIcon = scrapy.Field()      # follow / viewing stats line
    totalPrice = scrapy.Field()    # total price
    unitPrice = scrapy.Field()     # price per square meter
    whvi = scrapy.Field()          # URL of the listing's detail page
pipelines.py

import json
import codecs


class Test4Pipeline(object):
    def __init__(self):
        # codecs.open with 'utf-8' keeps the Chinese text readable in the output
        self.file = codecs.open('vihu.json', 'w', 'utf-8')

    def process_item(self, item, spider):
        # dump each item as one JSON object per line
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
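One thing worth knowing: because the pipeline writes one JSON object per line, vihu.json is in JSON Lines format rather than a single JSON array. Each line looks like

{"title": "...", "address": "...", "houseInfo": "...", "totalPrice": "...", "unitPrice": "...", ...}

so you read it back line by line, not with a single json.load().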
settings.py (only the parts that changed)

USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

ITEM_PIPELINES = {
    'test4.pipelines.Test4Pipeline': 300,
}

At first I had a second ITEM_PIPELINES entry left over from another demo project, pointing at 'scrapydemo.pipelines.ScrapydemoPipeline'. Make sure only the test4 pipeline is registered, or Scrapy will fail trying to import a module that doesn't exist.
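One more setting worth checking if the spider runs but scrapes nothing: newer Scrapy project templates enable ROBOTSTXT_OBEY = True by default, which filters out every request if the site's robots.txt disallows crawling. Whether Lianjia needs this depends on their robots.txt, so treat it as something to try, not a given:

ROBOTSTXT_OBEY = False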
vihu.py (in the spiders folder)

# -*- coding: utf-8 -*-
import scrapy
import sys

# hard-coded project path so "from test4.items import ..." resolves;
# unnecessary if you launch the spider with "scrapy crawl" from the project root
sys.path.append("C:/Users/zhang-peng/scrapy/test4")
from test4.items import Test4Item


class VihuSpider(scrapy.Spider):
    name = 'vihu'
    # allowed_domains wants bare domain names, not full URLs
    allowed_domains = ['wx.lianjia.com']
    # Lianjia paginates as .../ershoufang/xinwu/pg1, pg2, ...; queue the first
    # 100 pages. (The URL prefix is inlined because a class attribute isn't
    # visible inside a class-level list comprehension in Python 3.)
    start_urls = ['https://wx.lianjia.com/ershoufang/xinwu/pg' + str(page)
                  for page in range(1, 101)]
    def parse(self, response):
        # every listing on the page sits in a <div class="info clear"> block
        house_list = response.xpath("//div[@class='info clear']")
        for each in house_list:
            item = Test4Item()
            # extract_first('') returns '' instead of raising IndexError when
            # a field is missing; strip() trims the surrounding whitespace
            item['title'] = each.xpath("./div[@class='title']/a/text()").extract_first('').strip()
            item['address'] = each.xpath("./div[@class='address']/div[@class='houseInfo']/text()").extract_first('').strip()
            item['houseInfo'] = each.xpath("./div[@class='address']/div[@class='houseInfo']/a/text()").extract_first('').strip()
            item['positionIcon'] = each.xpath('./div[@class="flood"]/div[@class="positionInfo"]/text()').extract_first('').strip()
            item['blank'] = each.xpath('./div[@class="flood"]/div[@class="positionInfo"]/a/text()').extract_first('').strip()
            item['totalPrice'] = each.xpath('./div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()').extract_first('').strip()
            item['unitPrice'] = each.xpath('./div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()').extract_first('').strip()
            item['starIcon'] = each.xpath('./div[@class="followInfo"]/text()').extract_first('').strip()
            item['whvi'] = each.xpath("./div[@class='title']/a/@href").extract_first('').strip()
            yield item
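With all of that in place, run the spider from the project's root directory (the folder containing scrapy.cfg); the listings end up in vihu.json next to it:

scrapy crawl vihu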