爬虫前的准备知识
urllib
>>> import urllib
>>> help(urllib)
Help on package urllib:
NAME
urllib
PACKAGE CONTENTS
error
parse
request
response
robotparser
urllib包说明可参见urllib python文档
主要使用request去进行url的请求
下面简单写一个请求
import urllib.request
import urllib.parse
import socket
import re
def getIp():
    """Fetch the proxy-list page and print its UTF-8 decoded HTML.

    Sends a GET request carrying a browser-like User-Agent header
    (presumably to avoid being rejected as a bot — TODO confirm the
    site actually requires it), then prints the whole page body.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    url = 'https://www.xicidaili.com/wt'
    request = urllib.request.Request(url, headers=header)
    # 'with' guarantees the response is closed even if read()/decode()
    # raises; the original leaked the connection on such an exception.
    with urllib.request.urlopen(request) as response:
        html = response.read()
    print(html.decode("utf-8"))


if __name__ == '__main__':
    getIp()
这里我们抓取的是一个ip代理网站的html内容,并进行了utf-8的解码,最后打印出来
现在需要将这些ip提取出来,需要先学习下怎么处理这整个html的字符串
正则表达式 re
在开始之前,先说明一下正则表达式要干什么
肯定是从字符串中找到你想要的那一部分啦
那我们看看re有什么接口
>>> import re
>>> re.__all__
['match', 'fullmatch', 'search', 'sub', 'subn', 'split', 'findall', 'finditer', 'compile', 'purge', 'template', 'escape', 'error', 'A', 'I', 'L', 'M', 'S', 'X', 'U', 'ASCII', 'IGNORECASE', 'LOCALE', 'MULTILINE', 'DOTALL', 'VERBOSE', 'UNICODE']
正则表达式的基本接口和用法如上,下面我们挑两个先实践一下,就search和findall吧
>>> help(re.search)
Help on function search in module re:
search(pattern, string, flags=0)
Scan through string looking for a match to the pattern, returning
a match object, or None if no match was found.
>>> help(re.findall)
Help on function findall in module re:
findall(pattern, string, flags=0)
Return a list of all non-overlapping matches in the string.
If one or more capturing groups are present in the pattern, return
a list of groups; this will be a list of tuples if the pattern
has more than one group.
Empty matches are included in the result.
这里由于只想拿出来字符中的ip,我们需要用到
\d+ : \d用于匹配数字[0-9],+表示匹配一次或多次
\. : 匹配 .
其余匹配字符串的方式可以查看官方文档
原汁原味的英文re说明
到时候想匹配什么,可以自己总结一个表快速查用
下面开始测试
>>> import re
>>> str = "fsalk 09u1 192.168.2.11"
>>> p = "(\d+\.){3}\d+"
>>> b = re.search(p,str)
>>> b
<_sre.SRE_Match object; span=(11, 23), match='192.168.2.11'>
>>> b.group()
'192.168.2.11'
>>> str2 = "fklz91ujlfnaoi"
>>> b = re.search(p,str2)
>>> b.group()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'NoneType' object has no attribute 'group'
这里没有匹配上的情况,返回给b是NoneType,所以要先判断再继续
>>> if b:
... print("matched")
... else:
... print("not matched")
...
not matched
这正则式不错,马上想用到findall接口上
>>> c = re.findall(p,str)
>>> c
['2.']
结果却不如人意,为啥呢,在上面的说明中有
If one or more capturing groups are present in the pattern, return
a list of groups;
而我们这里用的匹配字段是
p = "(\d+\.){3}\d+"
看来是不能简单愉快的用括号了,会被识别成子组,返回子组表
可以用(?:…)来避免子组识别
>>> p = "\d+\.\d+\.\d+\.\d+"
>>> c = re.findall(p,str)
>>> c
['192.168.2.11']
>>> p = "(?:(?:\d+)\.){3}\d+"
>>> c = re.findall(p,str)
>>> c
['192.168.2.11']
继续在上面获取IP的代码上开发
import urllib.request
import urllib.parse
import socket
import re
def getIp():
    """Download the proxy-list page and return the IPs found in it.

    Returns:
        list[str]: every dotted-quad (IPv4-looking) string extracted
        from the page HTML by test().
    """
    # socket.setdefaulttimeout(20)  # optionally cap socket-level waits at 20s
    header = {'User-Agent': 'Mozilla/5.0'}
    url = 'https://www.xicidaili.com/wt'
    request = urllib.request.Request(url, headers=header)
    # Context manager closes the connection even if read() raises;
    # the original only closed it on the success path.
    with urllib.request.urlopen(request) as response:
        html = response.read()
    # Now pull every IP out of the decoded HTML with the regex helper.
    list_ip = test(html.decode("utf-8"))
    return list_ip
def test(ipstr):
p = r'\d+\.\d+\.\d+\.\d+'
b = re.search(p, ipstr)
iplist = re.findall(p, ipstr)
return iplist
if __name__ == '__main__':
    # Fetch the page, extract every IP, and report how many were found.
    found = getIp()
    print("提取出ip个数 : ", len(found))
    print(found)
结果
提取出ip个数 : 100
['222.95.240.26', '222.223.182.66', '115.239.32.204', '220.168.86.37', '121.237.148.220', '117.88.5.6', '39.91.8.31', '218.18.158.216', '116.196.87.86', '117.88.177.157', '61.153.251.150', '119.57.108.89', '116.196.85.166', '183.172.151.150', '106.15.248.236', '116.196.85.150', '222.95.144.108', '222.95.241.216', '121.237.149.83', '222.95.144.78', '121.237.149.85', '121.237.148.202', '223.199.2.159', '121.41.0.202', '121.40.66.129', '183.5.99.10', '218.75.158.153', '119.134.110.115', '118.114.116.216', '117.88.5.140', '222.95.144.236', '121.237.149.17', '117.88.177.54', '222.95.240.216', '119.23.79.199', '117.88.177.246', '1.193.21.25', '101.231.104.82', '103.10.86.203', '183.167.217.152', '58.243.50.184', '117.88.177.40', '117.88.177.175', '202.105.136.92', '122.192.38.207', '117.88.177.189', '118.78.196.81', '111.206.217.186', '121.237.149.168', '124.239.216.14', '222.95.240.212', '121.237.149.104', '113.116.227.171', '115.204.70.15', '117.88.4.207', '110.87.176.121', '115.159.154.79', '171.13.249.207', '121.237.149.117', '117.88.4.249', '117.88.176.213', '59.110.154.102', '114.115.203.214', '117.88.176.170', '121.40.162.239', '117.88.5.28', '27.11.205.43', '120.132.116.81', '171.221.35.8', '121.237.148.151', '123.15.24.200', '183.165.35.104', '114.249.113.82', '59.52.187.80', '119.136.88.187', '113.88.211.244', '113.116.120.97', '27.44.4.138', '163.125.157.181', '27.43.108.60', '219.132.205.105', '120.78.168.189', '111.76.221.190', '111.160.169.54', '14.115.104.123', '119.123.76.117', '222.85.28.130', '112.95.204.185', '183.163.214.226', '58.212.111.176', '222.184.7.206', '183.136.177.77', '14.20.235.158', '218.75.109.86', '222.95.17.143', '115.223.69.125', '106.14.206.26', '117.87.180.144', '113.116.140.117', '118.114.116.214']
尝试从百度图片搜索中进行图片爬取
import time
import urllib.request
import urllib.parse
import urllib.error
import os
import re
def grabimg(item):
    """Search Baidu Images for a keyword and download every result.

    Args:
        item: UTF-8 encoded keyword (bytes), as produced by the caller's
            keyword.encode('utf-8').

    Side effects:
        Creates (or reuses) a directory named after the keyword, saves
        each image as <keyword><index>.jpg inside it, then chdirs back
        to the parent directory.
    """
    name = item.decode('utf-8')
    # Percent-encode the keyword for the query string. This replaces the
    # original repr()-string-surgery hack (replace '\\', 'x'->'%', strip
    # a 'b' and quotes), which corrupted any keyword containing the
    # letters 'b' or 'x'.
    target = urllib.parse.quote(name)
    print("item : ", target)
    url = 'http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&word=%s' % target
    print(url)
    # Context manager closes the connection even if read()/decode raises.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    # Image URLs appear in the page source after an "objURL" key.
    p = r'objURL.*?(http.*?)\"'  # (?:(?:jp[e]?g)|(?:png)|(?:\"))
    list1 = re.findall(p, html)
    print(len(list1))
    print(list1)
    dirname = name  # avoid shadowing the builtin 'dir'
    try:
        os.mkdir(dirname)
    except FileExistsError:
        pass  # directory already exists from an earlier run — reuse it
    os.chdir(dirname)
    # NOTE(review): the original also built an unused 'header' dict here;
    # urlretrieve() does not accept headers, so it is removed.
    for count, each in enumerate(list1):
        filename = name + str(count) + '.jpg'
        # Strip escape backslashes from the scraped URL. This replaces
        # the original eval(repr(each).replace('\\', '')) round-trip,
        # which needlessly ran eval() on scraped (untrusted) data.
        str1 = each.replace('\\', '')
        print(str1, filename)
        try:
            urllib.request.urlretrieve(str1, filename)
        except (urllib.error.URLError, OSError):
            # Best-effort download: skip URLs that fail and keep going.
            print("open failed")
    os.chdir('..')
if __name__ == '__main__':
    # Repeatedly prompt for a keyword; entering 'n' exits the loop.
    while True:
        raw = input("输入想要搜集的图片关键字:(输入n退出)")
        if raw == 'n':
            break
        encoded = raw.encode('utf-8')
        print(encoded)
        grabimg(encoded)
        print("当前关键字图搜集完毕")