# Python 第一个爬虫 demo -- 写作笔记 (first Python crawler demo -- study notes)
# coding:utf-8
import os
import re
import time
from urllib import request

from bs4 import BeautifulSoup
# Target site to scrape: both the JPG download and the title extraction
# in main() run against this single URL.
test_url = 'http://dev.ruomengtv.com'
def getResponse(url):
    """Fetch *url* over HTTP and return the response body decoded as UTF-8.

    Uses a context manager so the underlying connection is always closed,
    even if reading or decoding raises (the original leaked the socket).
    Raises urllib.error.URLError (or UnicodeDecodeError) on failure.
    """
    with request.urlopen(url) as response:
        return response.read().decode('utf-8')
# Module-level accumulator for scraped product titles.  parse_html() appends
# to this on every call, so repeated calls keep growing the same list.
goods_name_list = []
def parse_html(html):
    """Extract product titles from *html* and return the accumulated list.

    Expects markup of the shape
    ``<div class="products"> <div class="item"> <div class="content">
    <div class="title">NAME</div> ...``.
    Titles are appended to the module-level ``goods_name_list`` (so repeated
    calls accumulate), and that list is returned.

    Items missing the expected content/title sub-divs are skipped instead of
    raising AttributeError (the original crashed on malformed items).
    """
    soup = BeautifulSoup(html, "lxml")
    goods_list_soup = soup.find('div', attrs={'class': 'products'})
    if goods_list_soup is not None:
        for item in goods_list_soup.find_all('div', attrs={'class': 'item'}):
            content = item.find('div', attrs={'class': 'content'})
            if content is None:
                continue  # malformed item: no content block
            title = content.find('div', attrs={'class': 'title'})
            if title is None:
                continue  # malformed item: no title block
            goods_name_list.append(title.getText())
    return goods_name_list
# Batch-download images and save them to the given path.
def batchDownloadJPGs(html, path='D:/pictures/'):
    """Download every ``.jpg`` referenced by an ``<img src="...">`` tag
    in *html* into *path*.

    Files are named ``<timestamp>_<N>.jpg`` with a 1-based sequence number
    so batches do not overwrite each other.  The target directory is
    created if missing (the original failed when it did not exist).

    NOTE(review): relative src URLs (e.g. ``/img/a.jpg``) cannot be
    retrieved as-is; the page's base URL would need to be joined first.
    """
    img_urls = re.findall(r'<img.+?src="(.+?\.jpg)"', html)
    # Ensure the destination directory exists before any download.
    os.makedirs(path, exist_ok=True)
    for count, url in enumerate(img_urls, start=1):
        # Timestamp + counter used for the file name.
        stamp = time.strftime("%Y-%m-%d_%H-%M-%S_", time.localtime())
        request.urlretrieve(url, ''.join([path, stamp + '{0}.jpg'.format(count)]))
        print('正在下载第' + str(count) + '张')
def main():
    """Entry point: fetch the test page once, download its JPGs, and
    print the scraped product titles."""
    # Fetch the page a single time and reuse the HTML for both steps
    # (the original issued two identical HTTP requests).
    html = getResponse(test_url)
    # Download the jpg files referenced on the page.
    batchDownloadJPGs(html)
    # Extract the product titles and print them.
    titles = parse_html(html)
    if titles is not None:
        for title in titles:
            print(title)


if __name__ == '__main__':
    main()