# -*- coding: utf-8 -*-
# @Time : 2022/5/24 14:13
# @Author : admin
# @Email : 1985264689@qq.com
# @File : spider_bs.py
# @Project : project
# @Description : BeautifulSoup scraping demos plus a small timing decorator.
import functools
import os.path
import re
import time

import requests
from bs4 import BeautifulSoup

# Seconds to wait for a server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Base URL that the relative ``data-src`` image paths are appended to.
YQ_BASE_URL = "http://jgpdf.ycywx.com/yuanqidesktop/"

# Directory the downloaded images are written into.
IMAGE_DIR = './woniunote/image'


def spider_bs():
    """Fetch the woniunote home page and print selected parts of it.

    Prints, in order: the ``href`` of every anchor that has one, the
    ``placeholder`` attribute of the ``#keyword`` element (if present),
    and the string of every element matching ``.title``.

    Raises:
        requests.RequestException: if the page cannot be fetched or the
            server answers with an HTTP error status.
    """
    resp = requests.get("http://www.woniunote.com/", timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    html = BeautifulSoup(resp.text, 'html.parser')

    # href=True skips anchors without an href, which would otherwise
    # raise KeyError on link['href'].
    for link in html.find_all('a', href=True):
        print(link['href'])

    # select_one returns None instead of raising when nothing matches.
    keyword = html.select_one('#keyword')
    if keyword is not None and keyword.has_attr('placeholder'):
        print(keyword['placeholder'])

    for title in html.select('.title'):
        print(title.string)


def spider_yq():
    """Scrape the yuanqi desktop page and download its list images.

    Prints the ``src`` of the first ``.logo`` element and of every
    ``.J_about`` element, then downloads every ``ul li img`` that carries
    a ``data-src`` attribute into ``./woniunote/image/``. Each file is
    named ``<parent-segment>-<last-segment>`` after the last two path
    segments of its URL, and that name is printed.

    Images that fail to download are reported and skipped so that one bad
    URL does not abort the remaining downloads.

    Raises:
        requests.RequestException: if the listing page itself cannot be
            fetched or the server answers with an HTTP error status.
    """
    reps = requests.get(
        "http://jgpdf.ycywx.com/yuanqidesktop/oldbz.html"
        "?softid=585&tid1=133&tid2=1001&tod1=13428",
        timeout=REQUEST_TIMEOUT,
    )
    reps.raise_for_status()
    html = BeautifulSoup(reps.text, 'html.parser')

    logo = html.find(class_='logo')
    if logo is not None and logo.has_attr('src'):
        print(logo['src'])

    for about in html.find_all(class_='J_about', src=True):
        print(about['src'])

    # open(..., 'wb') does not create missing parent directories.
    os.makedirs(IMAGE_DIR, exist_ok=True)

    for image in html.select('ul li img'):
        data_src = image.get('data-src')
        if not data_src:
            # No lazy-load source on this <img>; nothing to download.
            continue

        image_url = YQ_BASE_URL + data_src
        segments = image_url.split('/')
        filename = segments[-2] + '-' + segments[-1]
        print(filename)

        try:
            resp = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
        except requests.RequestException as exc:
            # Do not write an error page to disk as if it were an image.
            print(f'skip {image_url}: {exc}')
            continue

        with open(f'{IMAGE_DIR}/{filename}', 'wb') as f:
            f.write(resp.content)


def outer(func):
    """Decorate ``func`` so each successful call prints its elapsed time.

    The wrapper forwards all positional and keyword arguments to ``func``
    and returns its result. After ``func`` returns, the elapsed time in
    seconds is printed, rounded to 4 decimal places.
    """
    @functools.wraps(func)
    def inner(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments the way time.time() can be.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print(round(end - start, 4))
        return result
    return inner


# NOTE: the function name shadows the builtin ``sum``; it is kept unchanged
# so existing callers of this module keep working.
@outer
def sum():
    """Add the integers 0..99,999,999 in a plain loop and print the total.

    Serves as a CPU-bound workload for the ``outer`` timing decorator.
    """
    total = 0
    for i in range(100000000):
        total += i
    print(total)


if __name__ == '__main__':
    # Uncomment the demo to run.
    # spider_bs()
    # spider_yq()
    sum()
Python BeautifulSoup 爬虫
最新推荐文章于 2024-09-05 11:10:46 发布