'''
Purpose: crawl QQ numbers from web pages in a loop.
Result: still has some bugs, but the central-controller scheduling of the
crawl loop works.
'''
from urllib import request
import ssl
import re
from collections import deque
def getUrlBytes(url):
    # set the request headers
    headers = {
        'Accept': 'text/html, application/xhtml+xml, */*',
        # 'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2545.400',
        'DNT': '1',
        'Connection': 'Keep-Alive'
    }
    # use an unverified SSL context so HTTPS pages load without certificate checks
    context = ssl._create_unverified_context()
    # build the request
    req = request.Request(url, headers=headers)
    # fetch and decode the response body
    response = request.urlopen(req, timeout=1, context=context)
    return response.read().decode("utf-8")
def writeFileBytes(htmlBytes, topath):
    with open(topath, 'wb') as fp:
        fp.write(htmlBytes)
def writeFileString(qqString, topath):
    # append mode, so results from every crawled page accumulate in one file
    with open(topath, 'a+') as fp:
        fp.write(str(qqString))
def qqCrawker(url, toPath):
    count = 0
    try:
        qqData = getUrlBytes(url)
    except Exception:
        # network errors, timeouts, or non-UTF-8 pages: skip this URL
        return None
    if qqData is None:
        return None
    # extract QQ numbers and deduplicate
    re_qq = re.compile(r'[1-9]\d{4,9}')
    qqList = re_qq.findall(qqData)
    qqList = list(set(qqList))
    for qq in qqList:
        count += 1
        # pad to 11 characters so the numbers line up in the output file
        writeFileString(qq.ljust(11), toPath)
    # match http URLs: collect link addresses to crawl next
    # re_url = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:[\w\d]+[\w\d]+|([^[:punct:]\s]|/)))')
    re_url = re.compile(r'(((http|ftp|https)://)(([a-zA-Z0-9._]+\.[a-zA-Z]{2,6})|'
                        r'([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))'
                        r'(:[0-9]{1,4})*(/[a-zA-Z0-9&%_./~-]*)?)')
    urlList = re_url.findall(qqData)
    print(count)
    return urlList
url = 'https://www.douban.com/group/topic/110094603/'
toPath = r'F:\编程语言学习\Python的学习\2019年3月份的学习\qqCrawker\QQ.txt'
#qqCrawker(url, toPath)
def centerControl(url, toPath):
    queue = deque()
    visited = set()
    queue.append(url)
    visited.add(url)
    while len(queue) != 0:
        targetUrl = queue.popleft()
        urlList = qqCrawker(targetUrl, toPath)
        if urlList is None:
            continue
        for item in urlList:
            # group 0 of each match tuple holds the full URL
            tempUrl = item[0]
            # skip URLs we have already queued, otherwise the loop revisits pages forever
            if tempUrl not in visited:
                visited.add(tempUrl)
                queue.append(tempUrl)
centerControl(url, toPath)
'''
Summary: the crawler work pauses here; I will go back to studying Python
fundamentals, and once those are done I will return and extend the crawler.
After two, even three days, I finally have a basic grasp of web crawling: a
crawler is a program that imitates a browser to automatically fetch the data
we want from a site. It looks simple, but many details hide inside it:
analysing the structure of the web pages, defining the extraction rules
(regular expressions for now), and processing and saving the results to a
file. The loop crawl driven by the central scheduler here also turns out to
be very slow; once I have learned multithreading I will come back and rework
it. I will keep moving forward!
'''
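
# As a first look at that multithreading idea, here is a minimal sketch of a
# threaded scheduler built on concurrent.futures from the standard library.
# It reuses qqCrawker above; the name centerControlThreaded and the pool size
# are illustrative assumptions, not part of the original script.
from concurrent.futures import ThreadPoolExecutor

def centerControlThreaded(url, toPath, workers=8):
    queue = deque([url])
    visited = {url}
    with ThreadPoolExecutor(max_workers=workers) as pool:
        while queue:
            # take up to `workers` queued URLs and crawl them concurrently
            batch = [queue.popleft() for _ in range(min(workers, len(queue)))]
            for urlList in pool.map(lambda u: qqCrawker(u, toPath), batch):
                if urlList is None:
                    continue
                for item in urlList:
                    tempUrl = item[0]
                    if tempUrl not in visited:
                        visited.add(tempUrl)
                        queue.append(tempUrl)

# Caveat: the concurrent qqCrawker calls all append to the same output file,
# so a real version would also guard writeFileString with a threading.Lock.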