数据爬取 (Data scraping with Scrapy)
# 创建工程 (create the Scrapy project)
scrapy startproject lianjia
cd lianjia
scrapy genspider gethouse www.xxx.com
items.py
import scrapy
class LianjiaItem(scrapy.Item):
    """Container for one rental listing scraped from bj.lianjia.com/zufang.

    Each attribute is a ``scrapy.Field`` populated by the spider's parse
    callback; field semantics follow the names (e.g. ``price`` is the
    listed rent) — confirm against the extraction code in gethouse.py.
    """

    name = scrapy.Field()
    community = scrapy.Field()
    area = scrapy.Field()
    house_type = scrapy.Field()
    acreage = scrapy.Field()
    price = scrapy.Field()
gethouse.py
import scrapy
from ..items import LianjiaItem
import re
# Number of listing pages to request per district (pg1 .. pg100).
page = 100
# Beijing district slugs interpolated into the lianjia.com rental URLs
# built in GethouseSpider.start_urls below.
areas = ['dongcheng', 'xicheng', 'haidian', 'chaoyang']
class GethouseSpider(scrapy.Spider):
name = 'gethouse'
# allowed_domains = ['www.xxx.com']
start_urls = []
for area in areas:
for i in range(1, page+1):
start_urls.append('https://bj.lianjia.com/zufang/{}/pg{}/'.format(area, i))
def parse(self, response)