import re
import urllib.request
from collections import deque

# Prefix prepended to the relative /p/... links found on each page
# ('网址' is the author's placeholder for the real site address).
append_url = '网址'
# count_tip = 0
queue = deque()
is_visit = set()
# movies = {}
# movie = []
f = open('D:\\1.txt', 'w+', encoding='utf-8')
count = 0
# Entry page ('网址' is again a placeholder for the real start URL).
url = '网址'
queue.append(url)
while queue:
    # Only fetch the content of 20 URLs.
    if count > 20:
        break
    # Dequeue the next URL.
    url = queue.popleft()
    # print(url)
    # is_visit records which URLs have already been seen; add the dequeued URL to it.
    is_visit |= {url}
    # Count how many URLs have been visited.
    count += 1
    try:
        url_open = urllib.request.urlopen(url, timeout=2)
        # Decode the first page as GBK and every later page as UTF-8.
        if count <= 1:
            data = url_open.read().decode('GBK')
        else:
            data = url_open.read().decode('utf-8')
    except Exception:
        continue
    # Regular expressions for extracting links and post content.
    link = re.compile(r'href="(/p/.+?)"')
    talk = re.compile(r'<cc><.+?>(.+?)<')
    for x in link.findall(data):
        # Put each discovered link into the queue.
        # Note: is_visit only records dequeued URLs, so a link can be queued
        # more than once before it is first visited.
        x = append_url + x
        if x not in is_visit:
            queue.append(x)
    try:
        # Write the extracted content to the file.
        for y in talk.findall(data):
            f.write(y + '\n')
    except Exception:
        continue
# Leftover from an earlier experiment: collect Zhihu question links,
# extract 《...》 movie titles, and count how often each title appears.
# link_re = re.compile(r'href="(http://www\.zhihu\.com/question/.+?)"')
# movie_re = re.compile(r'(《.+?》)')
# for x in link_re.findall(data):
#     if x not in is_visit:
#         queue.append(x)
# for y in movie_re.findall(data):
#     if len(y) < 10:
#         movie.append(y)
# for i in movie:
#     if movie.count(i) > 1:
#         movies[i] = movie.count(i)
# sorted(movies.values())
# for r in movies:
#     f.write(r)
#     f.write('\t')
#     f.write(str(movies[r]) + '\n')
f.close()
# print(movies)
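
The href="(/p/...)" pattern and the <cc> tag suggest a Tieba-style page layout, but the actual site is hidden behind the '网址' placeholder. As a minimal sanity check, the two expressions can be run against a hand-written snippet; the HTML below is invented for illustration and is not taken from a real page.

import re

# Invented sample markup: one relative thread link plus one <cc>-wrapped post.
sample = ('<a href="/p/123456">some thread</a>'
          '<cc><div class="post">first line of a post</div></cc>')

link = re.compile(r'href="(/p/.+?)"')
talk = re.compile(r'<cc><.+?>(.+?)<')

print(link.findall(sample))  # ['/p/123456']
print(talk.findall(sample))  # ['first line of a post']

On a real page, link.findall() feeds the queue (after append_url is prepended) and talk.findall() supplies the lines written to the output file.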