SequenceMatcher python

最新推荐文章于 2024-07-23 13:23:32 发布

sinat_15355869

最新推荐文章于 2024-07-23 13:23:32 发布

阅读量2.7k

点赞数

分类专栏：基础算法

本文链接：https://blog.csdn.net/sinat_15355869/article/details/92101821

版权

基础算法专栏收录该内容

65 篇文章 0 订阅

订阅专栏

最近遇到类似：Maximum Length of Repeated Subarray 问题：

发现两类解法，都认真看了一波，有点意思！

1）LeetCode 781 --> 很多做法是基于创建二维数组 --> 动态规划 --> 思路清晰，代码简短而又??！

问题：

Input:
A: [1,2,3,2,1]
B: [3,2,1,4,7]
Output: 3
Explanation: 
The repeated subarray with maximum length is [3, 2, 1].

解答：

class Solution:
    def findLength(self, A: List[int], B: List[int]) -> int:
        
        dp = [ [0 for _ in range(len(A) + 1)] for _ in range(len(B) + 1)]

        for i in range(1, len(B)+1):
            for j in range(1, len(A)+1):
                if B[i-1] == A[j-1]:
                    dp[i][j] = dp[i-1][j-1] + 1

        return max(max(row) for row in dp)

2）下面看下 python 内置库中是如何做的，

2.1 把B -> 变化为 {value:[idx1, idx2..]} value是指B中的每一个元素，idx1是指B序列中的位置 
2.2 j2len 中存储匹配成功的元素，以及对于的idx -》 j  --》 每次会跟新 newj2len 字典判断是否是连续的
2.3 可以加入junk操作，自定义
2.4 熟悉使用 collection 中的 namedtuple
2.5 熟悉使用 dict 中的 b2j.setdefault(v, []) & b2j.get(a[i], nothing) 操作
2.6 利用 k > bestsize 可以过滤出最大的 bestsize

代码：

# @TIME : 2019/6/15 上午9:21
# @File : my_SequenceMatcher.py
# ----------------------------------------------------------------

# 学习来自python库, diff
# -> SequenceMatch: (未使用二维数组,使用一维数组,字典，双指针over)
# -> 还可以指定 isjunk字符，设置是否跳过

from collections import namedtuple as _namedtuple
import time

Match = _namedtuple('Match', 'a b size')

class SequenceMatcher:

    def __init__(self, isjunk=None, a='', b='', autojunk=True):
        self.isjunk = isjunk
        self.a = self.b = None
        self.autojunk = autojunk
        self.set_seqs(a, b)
        self.Match = _namedtuple('Match', 'a b size')

    def set_seqs(self, a, b):
        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        if a is self.a:
            return
        self.a = a

    def set_seq2(self, b):
        """新建dict ->  存储b序列中每个元素对应的所有位置信息{value:[value_idx1.. value_idx2..]}"""
        if b is self.b:
            return
        self.b = b
        self.__chain_b()

    def __chain_b(self):
        """双下划线内部函数 -> 构建一会用的字典"""
        b = self.b
        self.b2j = b2j = {}

        for idx, v in enumerate(b):
            # 这个indices 是对应的 v 取出来的值，没有的话默认 []
            indices = b2j.setdefault(v, [])  #如果b2j字典中没有v这个key，那就设置一波默认值[], 如果有取出它的value，返回给indices，用来添加idx
            indices.append(idx)

        self.bjunk = junk = set()
        isjunk = self.isjunk
        # 如果存在有junk字典，需要先把他们从b2j中删除
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junk.add(elt)
            for elt in junk:
                del b2j[elt]

        # 清除popular元素
        self.bpopular = popular = set()
        n = len(b)
        if self.autojunk and n >= 200:
            ntest = n // 100 + 1
            for elt, idxs in b2j.items():
                if len(idxs) > ntest:
                    popular.add(elt)

            for elt in popular:
                del b2j[elt]

    def find_longest_match(self, a, b):
        alo = 0
        ahi = len(a)
        blo = 0
        bhi = len(b)

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
        besti, bestj,bestsize = alo, blo, 0
        j2len = {}  # 有点像用二维数组动态规划解法中的k，记录第几次连续成功匹配的值
        nothing = []  # 当b2j中，也就是b中没有a中存在的元素时候，返回空

        for i in range(alo, ahi):
            j2lenget = j2len.get   #获得j2len字典对象中的方法
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1  # 如果b中某个字符(idx=j)匹配成功，看下上一个j中的len
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k  # 更新指针位置，i, j 需要先减去之前的k + 1 (配上了一个)

            j2len = newj2len

        #  后面可以先省略，这里没有用到junk
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # 可以自己控制输入
        a = 1
        return Match(besti, bestj, bestsize)



if __name__ == "__main__":

    A = [1,2,3,2,1]
    B = [3,2,1,4,7]

    rels = []
    for i in range(100):
        start = time.time()
        # for i in range(100):
        m = SequenceMatcher(None, A, B)
        end = time.time()
        temp = end-start
        rels.append(temp)

    print("average_time:",sum(rels)/100)
    # print(m.find_longest_match(A, B))

运行时间还行：平均单次时间