Crawling QQ Numbers

import re
import ssl
import urllib.request
from collections import deque

# Save crawled bytes to a file
def writeFileBytes(htmlBytes, toPath):
    with open(toPath, "wb") as f:
        f.write(htmlBytes)

# Save crawled content to a file as text (decode the bytes;
# str() would write the b'...' repr instead of the page text)
def writeFileStr(htmlBytes, toPath):
    with open(toPath, "w", encoding="utf-8") as f:
        f.write(htmlBytes.decode("utf-8", errors="ignore"))

# Wrap the fetching part of the crawler
def getHtmlBytes(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"
    }
    req = urllib.request.Request(url, headers=headers)       # build the request
    context = ssl._create_unverified_context()                # skip certificate checks for https
    response = urllib.request.urlopen(req, context=context)   # send the request
    return response.read()

# Crawl one page: extract QQ numbers, save them, and collect outgoing URLs
def qqCrawler(url, toPath):
    htmlBytes = getHtmlBytes(url)
    htmlStr = htmlBytes.decode("utf-8", errors="ignore")

    # QQ number pattern: 5 to 11 digits with no leading zero
    re_qq = re.compile(r"[1-9]\d{4,10}")
    qqList = list(set(re_qq.findall(htmlStr)))  # dedupe before writing

    with open(toPath, "a") as f:
        for qqStr in qqList:
            f.write(qqStr + "\n")

    # URL pattern: http(s), host as domain name or IPv4, optional port and path
    pat = r"(((http|https)://)(([a-zA-Z0-9._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})?(/[a-zA-Z0-9$%_./~-]*)?)"
    re_url = re.compile(pat)
    # findall returns a tuple per match because the pattern has groups;
    # the whole URL is the outermost group, index 0
    urlList = re_url.findall(htmlStr)
    return qqList, urlList

# Crawl page after page through a queue (breadth-first)
def center(url, toPath):
    queue = deque()
    visited = set()  # without this the crawl revisits the same pages indefinitely
    queue.append(url)
    while len(queue) != 0:
        targetUrl = queue.popleft()
        if targetUrl in visited:
            continue
        visited.add(targetUrl)
        qqList, urlList = qqCrawler(targetUrl, toPath)
        for item in urlList:
            queue.append(item[0])  # item[0] is the full-URL group

# Call the function to crawl a single page
url = "https://example.com"  # example seed; replace with a real starting page
toPath = r"C:\Users\HP\Desktop\qqFile.txt"
qqCrawler(url, toPath)
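To go beyond a single page, call the queue-based center function with a seed URL instead of qqCrawler; it keeps dequeuing pages and enqueuing every URL each page yields. A minimal usage sketch, assuming the same output path as above and a placeholder seed URL (swap in a real starting page):

# Breadth-first crawl: each crawled page feeds new URLs back into the queue,
# so this can run for a long time on link-heavy sites
seedUrl = "https://example.com"  # placeholder seed, not from the original post
center(seedUrl, r"C:\Users\HP\Desktop\qqFile.txt")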