Python crawler deduplication: how do I deduplicate in a crawler?

I have a spider that crawls Chengdu Lianjia (cd.lianjia.com), but Lianjia rate-limits traffic. I work around this by reusing the same cookies and headers as my browser; whenever the limit kicks in, I solve the captcha manually and the spider keeps running. The problem is that it produces a lot of URLs that were already crawled, and the data is not being updated either — most of it is duplicated.

The core code is as follows:

import re
import time

import requests
import scrapy
from bs4 import BeautifulSoup

# LianjiaItem (the item class) and trans (the cookie-string helper) are defined
# elsewhere in the project and imported from there.


class LianjiaSpider(scrapy.Spider):
    name = 'lianjiaspider'
    start_urls = 'http://cd.lianjia.com/ershoufang/'
    cookie = trans.stringToDict()
    headers = {
        'Host': "cd.lianjia.com",
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'DNT': '1',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'en-US,en;q=0.8,zh;q=0.6',
    }

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls, headers=self.headers, method='GET',
                             cookies=self.cookie, callback=self.parse)

    def parse(self, response):
        body = response.body.decode('utf-8')
        soup = BeautifulSoup(body, 'html.parser')
        area_div = soup.select('div[data-role="ershoufang"]')
        area_list = area_div[0].find_all('a')
        for area in area_list:
            try:
                area_han = area.string                 # district name (Chinese)
                area_pin = area['href'].split('/')[2]  # district slug (pinyin)
                area_url = 'http://cd.lianjia.com/ershoufang/{}/'.format(area_pin)
                print(area_url)
                yield scrapy.Request(url=area_url, headers=self.headers, cookies=self.cookie,
                                     callback=self.detail_url,
                                     meta={"id1": area_han, "id2": area_pin})
            except Exception:
                pass

    def get_detail_info(self, item, url):  # follow each listing link to grab longitude/latitude
        contents = requests.get(url, headers=self.headers, cookies=self.cookie)
        body = contents.content.decode('utf-8')
        soup = BeautifulSoup(body, 'html.parser')
        transaction_div = soup.find('div', 'transaction')
        transaction_lis = transaction_div.find_all('li')
        item['last_buy_time'] = transaction_lis[2].text[4:]
        item['publish_time'] = transaction_lis[0].text[4:]

        regex = r'resblockPosition(.+)'
        items = re.search(regex, body)
        content = items.group()[:-1]                   # longitude/latitude
        longitude_latitude = content.split(':')[1]
        item['location'] = longitude_latitude[1:-1]

        id_regex = r'houseId(.+)'
        ids = re.search(id_regex, body)
        house_id_str = ids.group()[:-1]                # house id
        house_id = house_id_str.split(':')[1]
        item['house_id'] = house_id[1:-1]

    def detail_url(self, response):
        for i in range(1, 101):
            url = 'http://cd.lianjia.com/ershoufang/{}/pg{}/'.format(response.meta["id2"], str(1))
            time.sleep(2)
            try:
                print('Currently crawling: {}'.format(url))
                contents = requests.get(url, headers=self.headers, cookies=self.cookie)
                body = contents.content.decode('utf-8')
                soup = BeautifulSoup(body, 'html.parser')
                house_ul = soup.find('ul', 'sellListContent')
                houselist = house_ul.find_all('li')
                for house in houselist:
                    try:
                        item = LianjiaItem()
                        item['title'] = house.find('div', 'title').a.string
                        item['community'] = house.find('div', 'houseInfo').text.split('|')[0]
                        item['model'] = house.find('div', 'houseInfo').text.split('|')[1]
                        area_str = house.find('div', 'houseInfo').text.split('|')[2]
                        area_match = re.findall(r'\d+', area_str)
                        if len(area_match) == 2:
                            item['area'] = float(area_match[0] + '.' + area_match[1])
                        else:
                            item['area'] = float(area_match[0])
                        focus_num_str = house.find('div', 'followInfo').text.split('/')[0]
                        focus_num_match = re.findall(r'\d+', focus_num_str)
                        item['focus_num'] = focus_num_match[0]
                        watch_num_str = house.find('div', 'followInfo').text.split('/')[1]
                        watch_num_match = re.findall(r'\d+', watch_num_str)
                        item['watch_num'] = watch_num_match[0]
                        item['price'] = float(house.find('div', 'totalPrice').span.string) * 10000
                        average_price_str = house.find('div', 'unitPrice').span.string
                        average_price_match = re.findall(r'\d+', average_price_str)
                        item['average_price'] = average_price_match[0]
                        item['link'] = house.find('div', 'title').a['href']
                        item['city'] = response.meta["id1"]
                        self.get_detail_info(item, item['link'])
                    except Exception as e:
                        print(str(e))
                        pass
                    yield item
            except Exception:
                pass
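One detail that matters for deduplication: the pagination and detail pages above are fetched with requests.get inside the spider, so those URLs never pass through Scrapy's scheduler, and the built-in duplicate filter (RFPDupeFilter, which fingerprints every scheduled Request) never gets a chance to drop repeats. Below is a minimal sketch of how those fetches could be yielded as scrapy.Request objects instead, so the framework can filter already-seen URLs; the callback names parse_list and parse_detail are illustrative, not part of the original spider.

class LianjiaSpider(scrapy.Spider):
    # ... name, headers, cookie, start_requests() and parse() as above ...

    def detail_url(self, response):
        # Schedule every listing page as a Request; without dont_filter=True,
        # Scrapy's RFPDupeFilter drops any URL it has already seen in this run.
        for i in range(1, 101):
            url = 'http://cd.lianjia.com/ershoufang/{}/pg{}/'.format(response.meta["id2"], i)
            yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookie,
                                 callback=self.parse_list, meta=dict(response.meta))

    def parse_list(self, response):
        soup = BeautifulSoup(response.body.decode('utf-8'), 'html.parser')
        house_ul = soup.find('ul', 'sellListContent')
        for house in house_ul.find_all('li'):
            link = house.find('div', 'title').a['href']
            # Detail pages go through the scheduler too, so they are fingerprinted
            # and deduplicated the same way.
            yield scrapy.Request(url=link, headers=self.headers, cookies=self.cookie,
                                 callback=self.parse_detail, meta=dict(response.meta))

    def parse_detail(self, response):
        body = response.body.decode('utf-8')
        item = LianjiaItem()
        item['city'] = response.meta["id1"]
        item['link'] = response.url
        # ... extract the remaining fields from body as in get_detail_info() ...
        yield item

With this layout the time.sleep(2) throttle could also be replaced by Scrapy's DOWNLOAD_DELAY setting, though that is orthogonal to deduplication.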

I'm not sure whether I should switch to a Redis-backed spider (scrapy-redis / RedisSpider), or whether anyone has a better way to deduplicate the data or the URLs?
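(For reference, the scrapy-redis setup mentioned above boils down to a few settings entries that move Scrapy's request-fingerprint set into Redis so it survives restarts, and duplicate records can additionally be dropped in an item pipeline keyed on house_id. The sketch below assumes a Redis server on localhost; the pipeline class name and the Redis key are illustrative.)

# settings.py — scrapy-redis keeps request fingerprints in Redis, so URLs crawled
# in earlier runs are not scheduled again.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True                   # keep the queue and fingerprint set between runs
REDIS_URL = 'redis://localhost:6379'

# pipelines.py — item-level dedup on house_id with a plain Redis set.
import redis
from scrapy.exceptions import DropItem

class DuplicateHousePipeline(object):      # illustrative name
    def open_spider(self, spider):
        self.db = redis.StrictRedis(host='localhost', port=6379, db=0)

    def process_item(self, item, spider):
        # SADD returns 0 when the value is already in the set, i.e. a duplicate house.
        if self.db.sadd('lianjia:seen_house_ids', item['house_id']) == 0:
            raise DropItem('duplicate house_id: {}'.format(item['house_id']))
        return item

The pipeline would still need to be registered in ITEM_PIPELINES to take effect.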
