python显示dgn_python系列整理---同一站点pattern自动生成

# coding=utf-8

import jieba

class GenePattern:
    """Build one URL regular expression covering a group of same-site URLs.

    The entry point is :meth:`gene_pattern`.  Each URL is split at a
    delimiter (``?`` or ``#``) into a *prefix* (host + path) and a *suffix*
    (``key=value`` items joined by ``&``).  Positions/keys whose value is
    identical across all URLs are kept literally; the others are replaced by
    a wildcard fragment.

    Instance attributes (all rewritten while a pattern is generated):
        is_same_vector_flag: True when every URL prefix yields the same
            number of ``jieba`` tokens and the same number of ``/`` segments.
        is_same_prefix_flag: True when every URL prefix has the same number
            of ``/``-separated segments.
        domain_end_pos: number of ``.``-separated labels in the host part,
            set by :meth:`gene_vectors` when the prefix is split on ``/``.
        delimiter: the character separating prefix and parameters.
    """

    # Single-character tokens that get a backslash prefix when they are
    # emitted literally into the prefix pattern.
    _ESCAPE_TOKENS = ('-', '+', '=', '.', '?', '(', ')')

    def __init__(self):
        self.is_same_vector_flag = True
        self.is_same_prefix_flag = True
        self.domain_end_pos = 0
        self.delimiter = '?'

    @staticmethod
    def cut_words(s, split=None):
        """Tokenise *s*.

        Args:
            s: the string to tokenise.
            split: when truthy, ``s`` is split on this separator with
                ``str.split``; otherwise ``jieba.cut`` is used.

        Returns:
            A list of tokens.
        """
        if split:
            return s.split(split)
        return list(jieba.cut(s))

    def is_same_vector(self, words):
        """Compare the shape of every URL prefix and update both flags.

        The first URL is the reference.  Scanning stops at the first URL
        whose jieba token count or ``/`` segment count differs from it.

        Args:
            words: iterable of URL strings.

        Returns:
            None.  The outcome is stored in ``is_same_vector_flag`` and
            ``is_same_prefix_flag``.
        """
        self.is_same_vector_flag = True
        self.is_same_prefix_flag = True
        all_length = 0
        split_length = 0

        for word in words:
            # Keep only host + path: drop the parameters and the scheme.
            word = word.split(self.delimiter)[0].split('//')[-1]
            token_count = len(self.cut_words(word))
            segment_count = len(self.cut_words(word, '/'))

            if not all_length and not split_length:
                # First URL: remember its shape as the reference.
                all_length, split_length = token_count, segment_count
                continue

            tokens_differ = all_length != token_count
            segments_differ = split_length != segment_count
            if tokens_differ or segments_differ:
                self.is_same_vector_flag = False
                if segments_differ:
                    print(f'words split by "/", but is not equal: {words}')
                    self.is_same_prefix_flag = False
                break

    def gene_pattern(self, urls):
        """Return a regex string matching the given URL(s).

        Args:
            urls: a single URL string or an iterable of URL strings.

        Returns:
            ``''`` for empty input; ``https?://<host>/.*`` for a single URL;
            otherwise ``https?://<prefix pattern>`` optionally followed by
            the escaped delimiter and the parameter pattern.
        """
        if not urls:
            return ''
        # Only a bare string is wrapped; any other iterable (list, tuple,
        # generator, ...) is treated as a collection of URLs.
        urls = [urls] if isinstance(urls, str) else list(urls)
        if not urls:
            return ''

        if len(urls) == 1:
            # NOTE: the host is inserted verbatim (dots are not escaped).
            domain = urls[0].split('//')[-1].split('/')[0]
            return f'https?://{domain}/.*'

        # The delimiter is chosen from the first URL only.
        self.delimiter = '#' if '#' in urls[0] else '?'
        self.is_same_vector(urls)

        vectors = self.gene_vectors(urls)
        prefix_pattern = self.gene_prefix_pattern(
            [words for words, _ in vectors])
        suffix_pattern = self.gene_params_pattern(
            [params for _, params in vectors])

        # Backslash-escape the delimiter so '?' is literal in the regex.
        delimiter = f'\\{self.delimiter}' if suffix_pattern else ''
        return f'https?://{prefix_pattern}{delimiter}{suffix_pattern}'

    def gene_vectors(self, urls):
        """Split every URL into ``(prefix_words, params)``.

        Args:
            urls: iterable of URL strings.

        Returns:
            A list with one ``(words, params)`` tuple per URL.  ``words`` is
            the token list of host + path.  ``params`` is a list of
            ``[key, value]`` pairs taken from the part after the first
            delimiter; ``value`` is ``None`` for an item without ``=``.
            Empty items (e.g. from a trailing ``&``) are skipped.
        """
        vectors = []
        for url in urls:
            # Split on the FIRST delimiter only so nothing after a second
            # delimiter is silently discarded.
            all_prefix, _, all_suffix = url.partition(self.delimiter)

            # Part after the delimiter: key/value parameters.
            params = []
            for item in all_suffix.split('&'):
                if not item:
                    continue
                # partition() keeps any further '=' inside the value and
                # tolerates items that have no '=' at all.
                key, has_value, value = item.partition('=')
                params.append([key, value if has_value else None])

            # Part before the delimiter, scheme removed.
            prefix = all_prefix.split('//')[-1]
            if self.is_same_vector_flag and self.is_same_prefix_flag:
                words = self.cut_words(prefix)
            else:
                prefix_vector = self.cut_words(prefix, split='/')
                prefix_words = self.cut_words(prefix_vector[0], split='.')
                # Remember where the host labels end so the pattern can be
                # re-joined with '.' for the host and '/' for the path.
                self.domain_end_pos = len(prefix_words)
                words = prefix_words + prefix_vector[1:]

            vectors.append((words, params))
        return vectors

    def gene_prefix_pattern(self, vectors):
        """Build the host + path part of the pattern.

        Args:
            vectors: list of token lists, one per URL.

        Returns:
            The prefix pattern string.  Reads ``is_same_vector_flag`` and
            ``domain_end_pos``.
        """
        # Group tokens by position: index -> tokens found at that index.
        words_cut_map = {}
        for words in vectors:
            for index, word in enumerate(words):
                words_cut_map.setdefault(index, []).append(word)

        pattern_items = []
        index_same_part = -1  # last index of the unbroken constant run
        for index, vector in words_cut_map.items():
            if len(set(vector)) == 1:
                if index - index_same_part == 1:
                    # Still inside the leading run of constant tokens.
                    pattern_items.append(vector[0])
                    index_same_part += 1
                else:
                    # A constant token after a variable one: match the rest
                    # of the prefix with a catch-all and stop.
                    pattern_items.append('__all__')
                    break
            elif self.is_same_vector_flag:
                pattern_items.append('\\w+')
            else:
                pattern_items.append('[^/]+')

        pattern_items = [
            f'\\{word}' if word in self._ESCAPE_TOKENS else word
            for word in pattern_items
        ]

        if self.is_same_vector_flag:
            # jieba tokens already contain the separators.
            prefix_pattern = ''.join(pattern_items)
        else:
            domain = pattern_items[:self.domain_end_pos]
            suffix = pattern_items[self.domain_end_pos:]
            prefix_pattern = '\\.'.join(domain) + '/' + '/'.join(suffix)
        return prefix_pattern.replace('__all__', '.*')

    def gene_params_pattern(self, vectors):
        """Build the parameter part of the pattern.

        Args:
            vectors: list (one entry per URL) of ``[key, value]`` pairs as
                produced by :meth:`gene_vectors`.

        Returns:
            ``''`` when ``is_same_prefix_flag`` is False; otherwise the
            per-key fragments joined by ``&`` in first-seen key order:

            * ``key`` when the key never carries a value,
            * ``key(?:=[^\\=&]+)?`` when it only sometimes carries one,
            * ``key=value`` when at least two identical values were seen,
            * ``key=[^\\=&]+`` otherwise.
        """
        if not self.is_same_prefix_flag:
            return ''

        params_dict = {}
        for vector in vectors:
            for key, value in vector:
                params_dict.setdefault(key, []).append(value)

        params_list = []
        for key, values in params_dict.items():
            bare = [value is None for value in values]
            if all(bare):
                fragment = key
            elif any(bare):
                fragment = f'{key}(?:=[^\\=&]+)?'
            elif len(values) >= 2 and len(set(values)) == 1:
                fragment = f'{key}={values[0]}'
            else:
                fragment = f'{key}=[^\\=&]+'
            params_list.append(fragment)
        return '&'.join(params_list)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值