import urllib.request
import ssl
import os
import re
from collections import deque
def writeFileByte(htmlBytes, toPath):
    # Save the raw page bytes to a file.
    with open(toPath, "wb") as f:
        f.write(htmlBytes)

def writeFileStr(htmlStr, toPath):
    # Save the decoded page text to a file.
    with open(toPath, "w", encoding="utf-8") as f:
        f.write(htmlStr)
def getHtmlBytes(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    # Build the request with a browser-like User-Agent header.
    req = urllib.request.Request(url, headers=headers)
    # Skip certificate verification so HTTPS pages with bad certs still load.
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)
    return response.read()
# Crawl one page: save it to disk, write out any QQ numbers it contains,
# and return the list of URLs found on the page.
def qqCrawler(url, toPath):
    htmlBytes = getHtmlBytes(url)
    writeFileByte(htmlBytes, r"存文件目录\file1.html")
    htmlStr = htmlBytes.decode("utf-8", errors="ignore")
    writeFileStr(htmlStr, r"存文件目录\file2.txt")

    # Find QQ numbers: 5 to 10 digits, not starting with 0.
    # pat = r'</li>\n<li class="d_name" data-field=(.*?)}'  (earlier attempt, kept for reference)
    pat = r"[1-9]\d{4,9}"
    re_qq = re.compile(pat)
    qqList = re_qq.findall(htmlStr)
    # Remove duplicates.
    qqList = list(set(qqList))
    with open(toPath, "a") as f:
        for qqStr in qqList:
            f.write(qqStr + "\n")

    # Find URLs (group 1 of each match is the full URL).
    pat1 = r'(((https?|ftp)://)(([a-zA-Z0-9._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})?(/[a-zA-Z0-9&%_./~-]*)?)'
    re_url = re.compile(pat1)
    urlsList = re_url.findall(htmlStr)
    # Remove duplicates.
    urlsList = list(set(urlsList))
    return urlsList
# Breadth-first crawl: start from the seed URL and keep following the links
# found on each page. A visited set keeps the crawler from revisiting pages.
def center(url, toPath):
    queue = deque()
    visited = set()
    queue.append(url)
    while len(queue) != 0:
        targetUrl = queue.popleft()
        if targetUrl in visited:
            continue
        visited.add(targetUrl)
        urlList = qqCrawler(targetUrl, toPath)
        for item in urlList:
            tempUrl = item[0]
            queue.append(tempUrl)
# Seed URL to crawl (placeholder).
url = "http://xxx.xxx.com"
# File that collects the QQ numbers.
toPath = r"文件目录\qqFile.txt"
center(url,toPath)
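The script above decodes every page as UTF-8 and ignores errors. If a target page declares a different encoding, the charset can be read from the Content-Type header before decoding; below is a minimal sketch of that idea (the helper name getHtmlStr is not part of the original script and is only for illustration):

import ssl
import urllib.request

def getHtmlStr(url):
    # Hypothetical helper: fetch a page and decode it with the charset
    # declared in the Content-Type header, falling back to UTF-8.
    headers = {"User-Agent": "Mozilla/5.0"}
    req = urllib.request.Request(url, headers=headers)
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)
    charset = response.headers.get_content_charset() or "utf-8"
    return response.read().decode(charset, errors="ignore")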
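To sanity-check the QQ-number pattern without hitting a live page, the same regex can be run against a small snippet of text; the sample fragment below is made up purely for illustration:

import re

# Made-up HTML fragment, for illustration only.
sample = '<li class="d_name">user 123456789</li><li>qq: 987654321</li>'
pat = r"[1-9]\d{4,9}"
qqList = list(set(re.findall(pat, sample)))
print(qqList)  # e.g. ['123456789', '987654321'] (set order may vary)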