花了一整天时间学习python,一整天时间都花在了看视频和尝试上了。网上找的视频真心不错,真的要感谢分享的人和主讲人,这里也分享一下自己在网上找的资源,希望有些帮助:http://pan.baidu.com/s/1eSGksl8
下午的时候闲着无聊,也顾不上基础牢不牢固,比着视频自己写了一个爬虫程序,真的是花了整整一个下午的时间,效率真的低到不行。估计是心态不行,中午竟然无所事事地睡到三点钟,真的是颓废了。
import os
import time
import urllib.request
# Accumulates every discovered image URL across all keyword pages.
total_addr = []

# Site root; keyword pages and article paths are joined onto it.
URL = 'https://www.symmz.com'

# Keyword index pages to crawl; each maps to URL + '/<keyword>.html'.
# NOTE: the original list contained 'kunbang' twice, which crawled that
# keyword page two times; the duplicate has been removed.
KeyWord = [
    'xueshengmei', 'meinvxizao', 'mingxing', 'niuzaiku', 'sucaitu',
    'wangluomeinv', 'kunbang', 'nvyou', 'chemo', 'meimei', 'neihantu',
    'cosplay', 'ribenshaonv', 'rentiyishu', 'leisi', 'mishu', 'hushi',
    'neiyi', 'hanguomeinv', 'duanzi', 'sipai', 'duanqun', 'weimei',
    'qingchun', 'qizhi',
]

# Browser User-Agent header so the site does not reject us as a bot.
Header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/61.0.3163.100 Safari/537.36'}
def create_file(folder='D:/DownLoad'):
    """Ensure the download folder exists and make it the working directory.

    folder: target directory path; created (with parents) when missing.

    Side effect: changes the process working directory to *folder*.
    NOTE: the original file used ``os`` without importing it; ``import os``
    must be present at module level.
    """
    if not os.path.exists(folder):
        # makedirs also creates missing parent directories, unlike mkdir.
        os.makedirs(folder)
    os.chdir(folder)
def link_open(url):
    """Fetch *url* and return the raw response body as bytes.

    The module-level ``Header`` User-Agent is attached to the request.

    BUG FIXED: the original called ``Request(url, Header)`` — the second
    positional parameter of Request is *data* (turning this into a POST),
    not *headers* — and then discarded that object, calling
    ``urlopen(url)`` directly, so the header was never sent at all.
    """
    req = urllib.request.Request(url, headers=Header)
    with urllib.request.urlopen(req) as resp:
        return resp.read()
def find_page(url, page):
    """Return article page paths linked from a keyword index page.

    url:  full URL of the keyword index page to fetch.
    page: keyword string; links are matched as ``href="/<page>...html``.

    Returns a list of paths starting with '/', e.g. '/xueshengmei12.html'.
    """
    marker = 'href="/' + page
    page_addr = []
    html = link_open(url).decode('utf-8')
    a = html.find(marker)
    while a != -1:
        # The closing 'html' of the link should appear within 100 chars.
        b = html.find('html', a, a + 100)
        if b != -1:
            # a + 6 skips 'href="', keeping the leading '/'.
            page_addr.append(html[a + 6:b + 4])
            next_start = b
        else:
            # BUG FIXED: the original resumed the search from b == -1,
            # i.e. from the last character of the string, which aborted
            # the whole scan on the first malformed link. Skip past this
            # match instead and keep scanning.
            next_start = a + len(marker)
        a = html.find(marker, next_start)
    return page_addr
def get_image(url):
    """Return the .jpg image URLs embedded in the article page at *url*.

    Scans the HTML for ``src="http://img.symmz.com...jpg`` and returns the
    list of full image URLs. Debug prints from the original were removed.
    """
    marker = r'src="http://img.symmz.com'
    image_addr = []
    html = link_open(url).decode('utf-8')
    a = html.find(marker)
    while a != -1:
        # The 'jpg' extension should appear within 56 chars of the match.
        b = html.find('jpg', a, a + 56)
        if b != -1:
            # a + 5 skips 'src="'; b + 3 includes the 'jpg' extension.
            image_addr.append(html[a + 5:b + 3])
            next_start = b
        else:
            # BUG FIXED: the original resumed the search from b == -1,
            # which silently ended the scan after one non-jpg match.
            next_start = a + len(marker)
        a = html.find(marker, next_start)
    return image_addr
def download_image(addr):
    """Download every image URL in *addr* and log the URL list.

    addr: iterable of image URL strings.

    Side effects: switches the working directory to the download folder
    (via create_file), writes imageN.jpg files and a log.txt there.
    Pauses 2 s after every 50 downloads to be gentle on the server.
    """
    create_file(folder='D:/DownLoad')
    for i, each in enumerate(addr):
        filename = 'image{}.jpg'.format(i)
        with open(filename, 'wb') as f:
            f.write(link_open(each))
        # Same cadence as the original: sleep once 50 files are done.
        if (i + 1) % 50 == 0:
            time.sleep(2)
    # BUG FIXED: the original opened log.txt in 'wb' and wrote str
    # objects, which raises TypeError; write text mode instead.
    with open('log.txt', 'w', encoding='utf-8') as f:
        for each in addr:
            f.write(each + '\n')
# ------------ main entry point ------------ #
def main():
    """Crawl every keyword index page, collect image URLs, then download.

    BUG FIXED: the original (its pasted indentation is ambiguous) appears
    to have called download_image(total_addr) inside the crawl loop on the
    ever-growing accumulator, re-downloading and overwriting every
    previously fetched image on each iteration. Download exactly once,
    after all URLs are collected.
    """
    for page in KeyWord:
        temp_url = URL + '/' + page + '.html'
        print(temp_url)
        print('\n')
        for path in find_page(temp_url, page):
            total_addr.extend(get_image(URL + path))
    download_image(total_addr)
    print('下载结束')


if __name__ == '__main__':
    main()
尚且不够好,源码在:https://github.com/ywxkgdw/Vortex/blob/master/LessonF.py