MD5是一种哈希(摘要)算法,可以生成文件的“数字指纹”,用途很多:校验文件、校验密码等(严格来说它是单向哈希,而不是可逆的加密)
比如对爬虫爬到的URL计算MD5值:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector,HtmlXPathSelector
import sys,io,hashlib
class ChoutiSpider(scrapy.Spider):
    """Spider that prints article links found on aihami.com with MD5 digests.

    Every matching link is fingerprinted with MD5 so that duplicates can be
    recognised by comparing fixed-length digests instead of full URLs.
    """

    name = 'chouti'
    allowed_domains = ['aihami.com']
    start_urls = ['http://www.aihami.com/']

    # EXSLT ``re:test`` keeps only <a> elements whose href looks like an
    # ent.aihami.com article page.  A raw string is used so that ``\w`` and
    # ``\d`` reach the regex engine untouched (they are invalid escapes in a
    # normal string literal), and the dots are escaped so they match a
    # literal "." rather than any character.
    _ARTICLE_HREF_XPATH = (
        r"//a[re:test(@href,"
        r"'http://ent\.aihami\.com/\w+/\d+/\d+\.html')]/@href"
    )

    def parse(self, response):
        """Print each article URL found in *response* next to its MD5 digest.

        A URL whose digest was already seen in this response is reported as
        a duplicate instead of being printed again.

        Args:
            response: The Scrapy response for a downloaded page.
        """
        urls = response.xpath(self._ARTICLE_HREF_XPATH).extract()
        # Digests seen so far.  The set is local to this call, so
        # de-duplication only spans a single response.
        url_set = set()
        for url in urls:
            md5_url = self.md5(url)
            if md5_url in url_set:
                print("URL已存在", url)
                continue
            # The digest is meant to serve as the lookup key later on.
            url_set.add(md5_url)
            print(url, md5_url)

    def md5(self, url):
        """Return the hexadecimal MD5 digest of *url*.

        Args:
            url: The URL as text; it is encoded as UTF-8 before hashing.

        Returns:
            The 32-character lowercase hex digest.
        """
        return hashlib.md5(url.encode('utf-8')).hexdigest()
爬到的结果和对应的MD5值: