# Section 1: scrape news titles from the SDUT homepage and insert them into MySQL.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

# Local development MySQL connection.
conn = pymysql.connect(host='localhost', user='root', passwd='root',
                       db='mydb', port=3306, charset='utf8')
cursor = conn.cursor()

# NOTE(review): this header dict is never passed to urlopen(), so the request
# is sent without the custom User-Agent — confirm whether it was meant to be used.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

try:
    html = urlopen("https://www.sdut.edu.cn/")
    bsObj = BeautifulSoup(html, 'html.parser')
    # News titles live in <div class="xw-bt"> elements on the homepage.
    titles = bsObj.findAll('div', {'class': 'xw-bt'})
    for title in titles:
        print(title.text)
        # Query parameters must be a sequence: the original passed a bare
        # string `(str(title.text))` — parenthesized expression, not a tuple —
        # which pymysql happens to tolerate but is fragile and non-standard.
        cursor.execute(
            "insert into t_title (title) VALUES (%s)", (str(title.text),)
        )
    # Commit once after all rows are inserted.
    conn.commit()
finally:
    # Release DB resources (the original leaked both cursor and connection).
    cursor.close()
    conn.close()
# Section 2: download campus photos from the SDUT "lgwindow" gallery pages.
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen

url_path = 'https://lgwindow.sdut.edu.cn'  # site root, prepended to relative img srcs
# Gallery index pages 1..7.
urls = ['https://lgwindow.sdut.edu.cn/lgyx/list{}.htm'.format(i) for i in range(1, 8)]
path = 'C://Users/14760/Desktop/photo/'  # local save directory
# NOTE(review): this header dict is never passed to any request — confirm
# whether it was meant to be used.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
}
# Renamed from `list`, which shadowed the builtin. Accumulates every photo
# URL across pages so the total can be reported at the end.
photo_urls = []


def get_photo(url):
    """Scrape one gallery index page and download every photo found on it.

    Appends the absolute photo URLs to the module-level ``photo_urls``.
    """
    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    imgs = soup.select('table>tr>td>div>div>a>img')
    # Collect this page's image URLs first.
    page_urls = []
    for img in imgs:
        src = img.get('src')
        print(src)
        page_urls.append(url_path + src)
    photo_urls.extend(page_urls)
    # Download only this page's images. The original iterated the whole
    # accumulated list on every call, re-downloading earlier pages' photos
    # each time (same files, wasted bandwidth).
    for item in page_urls:
        data = requests.get(item)
        # Last 8 characters of the URL serve as the local filename.
        with open(path + item[-8:], 'wb') as fp:
            fp.write(data.content)


for url in urls:
    get_photo(url)
print(len(photo_urls))
# Section 3.1: scrape car photos from an autohome.com.cn search-result page.
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

llist = set()  # de-duplicated detail-page URLs (set removes repeats)
imgs = set()   # de-duplicated <img> tags collected from detail pages
path = 'C://Users/14760/Desktop/photo/'  # local save directory

# Fetch the search-result (hub) page.
# BUG FIX: the original line ended with `//主网址` — in Python `//` is floor
# division, so it parsed as division by the undefined name 主网址 and raised
# NameError at runtime. It was clearly intended as a `#` comment.
html = urlopen('https://sou.autohome.com.cn/zonghe?q=T-cross&pvareaid=3311668&page=1&from_type=3')
soup = BeautifulSoup(html, 'lxml')
# Detail-page links on the hub page.
urls = soup.select('dl.list-dl>dt>a')
for u in urls:
    href = u.get("href")
    # Keep only absolute http:// links (the original's
    # re.findall('http\:\/\/(.*?)', ...) truthiness test amounts to this check).
    if href.startswith('http://'):
        llist.add(href)
        print(href)


def get_photo(url):
    """Collect photo <img> tags from one detail page into the global ``imgs`` set.

    Photos appear under two different attributes depending on the template:
    ``data-src`` (product card images) or ``data-imageurl`` (user uploads).
    """
    page = urlopen(url)
    page_soup = BeautifulSoup(page, 'lxml')
    imgs2 = page_soup.findAll('img', {'data-src': re.compile(r"^(.*?)\.autoimg\.cn\/cardfs\/product(.*)")})
    imgs1 = page_soup.findAll('img', {'data-imageurl': re.compile(r"^(.*?)\.autoimg\.cn\/youchuang\/(.*)")})
    imgs.update(imgs1)
    imgs.update(imgs2)


# Visit every collected detail page.
for url in llist:
    get_photo(url)

for img in imgs:
    # Normalise: the address lives in either data-src or data-imageurl.
    src = img.get('data-src') or img.get('data-imageurl')
    print(src)
    # Protocol-relative URLs (//host/...) need an explicit https: prefix;
    # fully-qualified http(s) URLs are fetched as-is.
    if src.startswith('https://') or src.startswith('http://'):
        data = requests.get(src)
    else:
        data = requests.get("https:" + src)
    # Last 8 characters of the URL serve as the local filename; the context
    # manager replaces the original's manual open/close.
    with open(path + src[-8:], 'wb') as fp:
        fp.write(data.content)