基于python的串模式匹配

python串模式匹配算法

  1. 使用串模式匹配算法对字符串进行固定格式的分析,选择有用的部分。

  2. 原来的程序主要是利用字符串的find方法进行查找。

  3. 加入了字符串的replace和split方法对字符串进行处理。在fruits函数中递归地读取网页,匹配需要的字符串并切换到相应的目录以建立目录结构;makeplur函数则读取最底层的网页,并将匹配到的字符串写入文件。

  4. 程序运行完成后会生成一个名为changan的目录,在里面存储有古诗文网所有的典籍。

import os
import sys
import urllib.request

def makeplur(applr, litche):
	"""Download one page and write the text extracted from it to a file.

	The page is staged in a scratch file named 'j' in the current directory
	and scanned line by line:

	* after a line containing 'title', the first whitespace-separated token
	  of the following line (with '_' treated as a space) is written;
	* after a line containing 'bookvson2', two fields of the following line
	  are written and body mode is switched on;
	* in body mode every line is split on '<' and '>' and selected pieces are
	  written, until a line containing '/div' ends the scan.

	The connection, both file handles and the scratch file are released even
	when parsing fails part-way through.

	Args:
		applr: URL path appended to 'http://so.gushiwen.org'.
		litche: Name of the output file, opened for writing relative to the
			current directory.

	Raises:
		IndexError: If a marker line splits into fewer fields than the
			parser indexes.
	"""
	applr = 'http://so.gushiwen.org' + applr
	# The object returned by urlopen() is a context manager; close the
	# connection as soon as the body has been read.
	with urllib.request.urlopen(applr) as response:
		banane = response.read()
	# NOTE(review): the bytes are decoded with the local filesystem encoding,
	# not with a charset taken from the response -- confirm this is intended.
	page_text = banane.decode(sys.getfilesystemencoding())
	try:
		with open('j', 'w') as scratch:
			scratch.write(page_text)
		with open('j') as pomelp, open(litche, 'w') as plur:
			in_body = False
			for mangb in pomelp:
				if in_body:
					if '/div' in mangb:
						break
					# Both tag delimiters become one separator.  From here to
					# the end of this iteration mangb is a LIST of pieces, so
					# the 'in' tests below are element-equality tests.
					mangb = mangb.replace('<', '@').replace('>', '@').split('@')
					count = len(mangb)
					if count == 1:
						plur.write(mangb[0] + '\n')
					elif count == 5:
						plur.write(mangb[2] + '\n')
					elif count == 7:
						plur.write(mangb[2] + mangb[4] + '\n')
					if 'br /' in mangb:
						if 'strong' in mangb:
							# Every second piece starting at index 4.
							plur.write(mangb[4] + '\n')
							for idx in range(6, count, 2):
								plur.write(mangb[idx] + '\n')
						else:
							# Every second piece, starting after a 'p' piece
							# when the list contains one.
							start = 2 if 'p' in mangb else 0
							for idx in range(start, count, 2):
								plur.write(mangb[idx] + '\n')
				if 'title' in mangb:
					mangb = pomelp.readline()
					mangb = mangb.replace('_', ' ').split()
					plur.write(mangb[0] + '\n')
					# Consume one more line; the 'bookvson2' test below is
					# applied to this freshly read line.
					mangb = pomelp.readline()
				if 'bookvson2' in mangb:
					mangb = pomelp.readline()
					mangb = mangb.replace('<', ' ').replace('>', ' ').split()
					if len(mangb) < 11:
						cherri = mangb[5] + mangb[7] + '\n'
					else:
						cherri = mangb[5] + mangb[10] + '\n'
					plur.write(cherri)
					in_body = True
	finally:
		# The scratch file is missing only if creating it is what failed.
		if os.path.exists('j'):
			os.remove('j')

def _escape_slash(name):
	"""Return name with its first '/' replaced by a backslash.

	Only the pieces before and after the first '/' are kept; anything after
	a second '/' is dropped.  Names without '/' are returned unchanged.
	"""
	if '/' in name:
		pieces = name.split('/')
		name = pieces[0] + '\\' + pieces[1]
	return name


def fruits(applr, cherri, lemoz, kiwa):
	"""Crawl one page and mirror it as directories and files on disk.

	The page is staged in a scratch file named 'i' in the directory that is
	current when the call starts, then scanned line by line.  On return the
	working directory is the parent of that starting directory.

	When cherri == 1 (listing page):
		* a line containing 'book_' is followed by an entry line; a directory
		  is created for the entry (unless a same-named item already exists)
		  and the entry's page is crawled recursively inside it;
		* a line containing the link to page lemoz + 1 ends this page and
		  continues the crawl with that next page.

	Otherwise (any other cherri):
		* a 'bookMl' line creates and enters a directory, first leaving the
		  previous such directory when kiwa == 2;
		* a 'bookv_' line is handed to makeplur() to fetch and store a page;
		* a '<span><a style=" color:#0F0F0F;"' line creates an empty file.

	Args:
		applr: URL path appended to 'http://so.gushiwen.org'.
		cherri: Mode/depth marker; 1 selects the listing parser.
		lemoz: Number of the current listing page.
		kiwa: 2 while the working directory is a directory entered for a
			'bookMl' line, so that it can be left again.

	Raises:
		IndexError: If a marker line splits into fewer fields than the
			parser indexes.
	"""
	applr = 'http://so.gushiwen.org' + applr
	waxd = 'Default.aspx?p=' + str(lemoz + 1)
	# Names already present in the current directory; listing entries whose
	# name matches one of them are skipped.
	duriao = set(os.listdir())
	# The object returned by urlopen() is a context manager; close the
	# connection as soon as the body has been read.
	with urllib.request.urlopen(applr) as response:
		banane = response.read()
	# NOTE(review): the bytes are decoded with the local filesystem encoding,
	# not with a charset taken from the response -- confirm this is intended.
	page_text = banane.decode(sys.getfilesystemencoding())
	# Remember the scratch file by absolute path: the working directory
	# changes while the page is being processed.
	scratch = os.path.abspath('i')
	try:
		with open(scratch, 'w') as staged:
			staged.write(page_text)
		with open(scratch) as pomelp:
			for mangb in pomelp:
				if cherri == 1:
					if 'book_' in mangb:
						mangb = pomelp.readline()
						mangb = mangb.replace('"', ' ').replace('<', ' ').replace('>', ' ').split()
						applr = mangb[5]
						litche = mangb[8]
						if litche in duriao:
							continue
						litche = _escape_slash(litche)
						os.mkdir(litche)
						os.chdir('./' + litche)
						# The nested call steps back out of the new directory
						# before it returns.
						fruits(applr, cherri + 1, lemoz, kiwa)
						duriao.add(litche)
					elif waxd in mangb:
						mangb = mangb.replace('"', ' ').replace('<', ' ').replace('>', ' ').split()
						applr = '/guwen/' + mangb[2]
						lemoz += 1
						# Release the scratch file first: the next page is
						# staged under the same name in the same directory.
						pomelp.close()
						os.remove(scratch)
						fruits(applr, cherri, lemoz, kiwa)
						# The nested call has already removed its scratch
						# file and left the directory.  Falling through to the
						# cleanup below would remove a file that no longer
						# exists and step up one directory too many.
						return
				else:
					print(1, end='')
					if 'bookMl' in mangb:
						if kiwa == 2:
							os.chdir('../')
							cherri -= 1
							kiwa -= 1
						# mangb becomes a list here, so the remaining 'in'
						# tests of this iteration are element-equality tests.
						mangb = mangb.replace('<', ' ').replace('>', ' ').split()
						litche = _escape_slash(mangb[3])
						os.mkdir(litche)
						os.chdir('./' + litche)
						cherri += 1
						kiwa += 1
					if 'bookv_' in mangb:
						mangb = mangb.replace('"', ' ').replace('<', ' ').replace('>', ' ').split()
						applr = mangb[3]
						litche = _escape_slash(mangb[4])
						print(mangb)
						makeplur(applr, litche)
					if '<span><a style=" color:#0F0F0F;"' in mangb:
						mangb = mangb.replace('<', ' ').replace('>', ' ').split()
						litche = _escape_slash(mangb[4])
						# Only the (empty) file is wanted.
						open(litche, 'w').close()
		if kiwa == 2:
			os.chdir('../')
	finally:
		# Already gone after a pagination hand-off, or never created if
		# staging the page is what failed.
		if os.path.exists(scratch):
			os.remove(scratch)
	os.chdir('../')
		
# Script entry: make sure the output directory 'changan' exists in the
# current directory, enter it, and start crawling from the '/guwen' listing.
if 'changan' not in os.listdir():
	os.mkdir('changan')
os.chdir('./changan')
fruits('/guwen', 1, 1, 1)

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值