Python专题 | 全面评测对比：子串匹配索引位置各方案

JasonLiu1919

已于 2023-06-18 16:44:38 修改

阅读量139

点赞数

分类专栏： Python 文章标签： python 子串匹配文本查找

于 2023-06-18 16:40:23 首次发布

本文链接：https://blog.csdn.net/ljp1919/article/details/131273231

版权

Python 专栏收录该内容

51 篇文章 1 订阅

订阅专栏

文章对比了多种在Python中查找子串索引位置的方法，包括使用`startswith`、`finditer`、`replace`、自定义循环以及正则表达式等。通过`timeit`模块进行性能测试，发现`find_substring_indices`和`find_all_substrings`方法在效率上表现最佳。

摘要由CSDN通过智能技术生成

背景

想要查找子串test_sub在原始文本test_str的索引位置。

各方案对比

各种常见的子串匹配方案如下：

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2023/6/18 16:23
# @Author  : JasonLiu
# @File    : test_match_index.py


import pdb
from functools import reduce
from timeit import timeit
import re
import time


def find_index_startswith(test_str, test_sub):
    # using list comprehension + startswith()
    # All occurrences of substring in string
    res = [i for i in range(len(test_str)) if test_str.startswith(test_sub, i)]
    return res


def find_index_finditer(test_str, test_sub):
    res = [i.start() for i in re.finditer(test_sub, test_str)]
    return res


def find_index_replace(test_str, test_sub):
    res = []
    while (test_str.find(test_sub) != -1):
        res.append(test_str.find(test_sub))
        test_str = test_str.replace(test_sub, "*" * len(test_sub), 1)
    return res


# 最快
def find_substring_indices(string, substring):
    # Initialize an empty list to store the start indices of the substrings
    indices = []
    # Start searching for the substring from the beginning of the string
    start_index = 0
    # Continue searching until the substring is not found in the remaining part of the string
    while True:
        # Find the next occurrence of the substring starting from the current start_index
        index = string.find(substring, start_index)
        if index == -1:
            # If the substring is not found, break out of the loop
            break
        else:
            # If the substring is found, add its start index to the list of indices
            indices.append(index)
            # Update the start index to start searching for the next occurrence of the substring
            start_index = index + 1
    # Return the list of indices
    return indices


res = find_substring_indices("我是卖核弹的小男孩", "核弹")
# pdb.set_trace()


def find_all_substrings(string, substring):
    # Initialize an empty list to store the indices of all occurrences of the substring.
    indices = []
    # Set the starting index i to 0.
    i = 0
    # Use a while loop to keep searching for the substring in the string.
    while i < len(string):
        # Use the find() method to find the first occurrence of the substring in the string, starting from the current index i.
        j = string.find(substring, i)
        # If find() returns -1, it means that there are no more occurrences of the substring in the string, so break out of the loop.
        if j == -1:
            break
        # If find() returns a non-negative value, append the index of the first character of the substring to the list,
        # and update the starting index i to the next character after the end of the substring.
        indices.append(j)
        i = j + len(substring)
    # Return the list of indices.
    return indices


def find_index_finditer_reduce(test_str, test_sub):
    # using re.finditer() to find all occurrences of substring in string
    occurrences = re.finditer(test_sub, test_str)
    # using reduce() to get start indices of all occurrences
    res = reduce(lambda x, y: x + [y.start()], occurrences, [])
    return res


def find(raw_string, short_text):
    if raw_string.find(short_text) > -1:
        pass


def re_find(raw_string, short_text):
    if re.match(short_text, raw_string):
        pass


# 最快
def best_find(raw_string, short_text):
    if short_text in raw_string:
        pass


# number: stmt执行的次数，默认是1000000，100万
# print(timeit("find(string, text)", "from __main__ import find; string='lookforme'; text='look'"))
# print(timeit("re_find(string, text)", "from __main__ import re_find; string='lookforme'; text='look'"))
# print(timeit("best_find(string, text)", "from __main__ import best_find; string='lookforme'; text='look'"))

print(timeit("find_index_startswith(string, text)",
             "from __main__ import find_index_startswith; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_index_finditer(string, text)",
             "from __main__ import find_index_finditer; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_index_replace(string, text)",
             "from __main__ import find_index_replace; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_substring_indices(string, text)",
             "from __main__ import find_substring_indices; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_all_substrings(string, text)",
             "from __main__ import find_all_substrings; string='我是卖核弹的小男孩'; text='小男孩'"))
print(timeit("find_index_finditer_reduce(string, text)",
             "from __main__ import find_index_finditer_reduce; string='我是卖核弹的小男孩'; text='小男孩'"))

start_time = time.time()
find_substring_indices("我是卖核弹的小男孩", "小男孩")
end_time = time.time()
print("find_substring_indices cost=", end_time - start_time)

start_time = time.time()
res = find_all_substrings("我是卖核弹的小男孩", "小男孩")
print("res=", res)
end_time = time.time()
print("find_all_substrings cost=", end_time - start_time)

运行结果如下：

1.2356753833591938
0.8407432101666927
0.5224904119968414
0.29449306428432465
0.2958533354103565
1.0457346253097057
find_substring_indices cost= 2.86102294921875e-06
res= [6]
find_all_substrings cost= 6.9141387939453125e-06

可以看出find_substring_indices和find_all_substrings这两种方式查找最快。