import urllib.request
import re
import pypinyin
import time
def getresources(url):
    """Fetch *url* (a cncn.com listing page) and return the matched entries.

    NOTE(review): the regex pattern on this line was truncated in this copy
    of the file (the source literally ends at ``re.compile(r'``). The
    pattern below is a best-guess placeholder that pulls ``title="..."``
    attributes out of the HTML — restore the real pattern from the original
    source before trusting the output.
    """
    # cncn.com serves GBK-encoded pages; ignore undecodable bytes.
    web_source = urllib.request.urlopen(url).read().decode("gbk", "ignore")
    content_re = re.compile(r'title="(.*?)"')  # TODO(review): confirm pattern
    return content_re.findall(web_source)
def getpage(url):
    """Fetch a cncn.com listing page and return its page count as a string.

    The page contains text like '共N页' ("N pages in total"); this strips
    the surrounding '共'/'页' characters and joins what remains.

    NOTE(review): the regex ``r' (.*?)'`` looks truncated — a lazy group
    with nothing after it always matches the empty string. Verify the
    pattern against the original source.
    """
    web_source = urllib.request.urlopen(url).read().decode("gbk", "ignore")
    page_re = re.compile(r' (.*?)')
    matches = page_re.findall(web_source)
    # BUG FIX: the original removed from and appended to `matches` while
    # iterating over it, which makes the iterator skip elements. Build a
    # cleaned list instead of mutating in place.
    cleaned = [re.sub(r'页', '', re.sub(r'共', '', item)) for item in matches]
    return "".join(cleaned)
def hp(word):
    """Convert Chinese characters in *word* to toneless pinyin.

    ``pypinyin.pinyin`` returns a list of per-character candidate lists;
    ``NORMAL`` style drops tone marks. All syllables are concatenated into
    one string (e.g. a city name becomes its cncn.com subdomain).
    """
    # Join once instead of the original quadratic `s += ...` loop.
    return "".join(
        "".join(syllables)
        for syllables in pypinyin.pinyin(word, style=pypinyin.NORMAL)
    )
# --- Script entry: crawl scenic-spot names per city from cncn.com -------
# NOTE(review, translated from the original Chinese trailer): the author
# only dared to crawl a portion at a time — crawling everything at once
# gets the client banned by the site. No proxy/IP rotation is used.

# BUG FIX: raw strings — the original "e:\userdict.txt" is a SyntaxError
# in Python 3 (\u starts an invalid unicode escape); also use `with` so
# both files are closed even on error.
with open(r"e:\jingdian2.txt", "w+") as data, \
        open(r"e:\userdict.txt", "r", encoding="utf-8-sig") as data1:
    # First comma-separated field of each line is a city name; convert it
    # to pinyin, which cncn.com uses as the subdomain (e.g. beijing.cncn.com).
    list1 = [hp(line.strip().split(',')[0]) for line in data1]
    print(list1)
    # Skip the first 10 cities — presumably already crawled; TODO confirm.
    list2 = list1[10:]
    print(list2)
    count = 0
    for city in list2:
        time.sleep(3)  # throttle requests so the site does not ban us
        print(city)
        count += 1
        if count > 5:  # crawl in small batches of 5 cities per run
            break
        base_url = "https://" + city + ".cncn.com/jingdian/"
        pages_text = getpage(base_url)
        print(pages_text)
        total_pages = int(pages_text.strip())
        print(total_pages)
        for page_no in range(1, total_pages + 1):
            # Page 1 is the bare listing URL; later pages use the
            # 1-<page>-0-0.html suffix.
            if page_no == 1:
                page_url = base_url
            else:
                page_url = base_url + "1-" + str(page_no) + "-0-0.html"
            # BUG FIX: the original reused `i` for both the page loop and
            # the entries loop, shadowing the page index.
            for entry in getresources(page_url):
                data.write(city + "," + entry + '\n')