"""Scrape book listings (title, author, discount/original price, cover image)
from the Dangdang e-book index page, save the cover images under ./photos,
export the rows to product.xlsx, and insert them into a local MySQL table.
"""

import os
import re            # kept from the original import block (unused here)
import random        # kept from the original import block (unused here)
import urllib.request

import pymysql
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Shared request headers: keep-alive plus a desktop Chrome UA so the site
# serves the normal HTML page instead of rejecting the default client.
HEADERS = {
    'connection': 'keep-alive',
    'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/67.0.3396.99 Safari/537.36'),
}

# Running image counter — persists across get_data() calls so saved covers
# never overwrite each other — and the accumulated scraped rows.
x = 1
datalist = []


def get_html(url):
    """Fetch *url* and return its body decoded as UTF-8 text."""
    r = requests.get(url, headers=HEADERS)
    r.encoding = 'utf-8'
    return r.text


def get_data(data):
    """Parse the listing HTML in *data* and accumulate rows into ``datalist``.

    For each product anchor: downloads the cover image to photos/<n>.jpg and
    appends ``[title, author, discount_price, original_price, image_url]``.
    Returns ``datalist`` (all rows accumulated so far, not just this page's).
    """
    global x
    soup = BeautifulSoup(data, 'html.parser')
    os.makedirs('photos', exist_ok=True)  # race-free replacement for isdir+mkdir
    for container in soup.find_all('div', attrs={'class': 'limitContent'}):
        for anchor in container.find_all('a', attrs={'target': '_blank'}):
            img_url = anchor.find('img')['src']  # assumed absolute URL — TODO confirm
            urllib.request.urlretrieve(img_url, 'photos/' + str(x) + '.jpg')
            # The description block is newline-separated:
            #   line 1 = title, line 2 = author,
            #   line 4 = "...¥<discount>¥<original>" (split on the ¥ sign).
            # Original code re-fetched and re-split this text five times;
            # do it once.
            desc = anchor.find('div', attrs={'class': 'limitDesc'}).text.split('\n')
            prices = desc[4].split('¥')
            datalist.append([desc[1], desc[2], prices[1], prices[2], img_url])
            x += 1
    return datalist


def saveExcel():
    """Write ``datalist`` to product.xlsx under a Chinese header row."""
    wbk = Workbook()
    sheet = wbk.create_sheet('product', 0)
    for col, title in enumerate(['书名', '作者', '折扣价', '原价', '图片地址'], start=1):
        sheet.cell(1, col).value = title
    for row, record in enumerate(datalist, start=2):
        for col, value in enumerate(record, start=1):
            sheet.cell(row, col).value = value
    wbk.save('product.xlsx')


def get_sql():
    """Open and return a connection to the local ``product`` database.

    NOTE(review): credentials are hard-coded; move them to config or
    environment variables before sharing/deploying this script.
    """
    return pymysql.connect(host='127.0.0.1', user='root', password='123456',
                           db='product', charset='utf8')


def createSql():
    """Create the ``product`` table. Run once before the first insert."""
    conn = get_sql()
    try:
        cur = conn.cursor()
        cur.execute(
            "create table product("
            "shuming varchar(255),"
            "zuozhe varchar(255),"
            "zhekou varchar(255),"
            "price varchar(255),"
            "url varchar(255))default charset=utf8")
        conn.commit()
    finally:
        conn.close()  # fix: the original leaked this connection


def saveSql(data):
    """Bulk-insert the scraped rows in *data* into the ``product`` table."""
    conn = get_sql()
    try:
        cur = conn.cursor()
        # Parameterized insert — pymysql escapes every value.
        cur.executemany("insert into product values(%s, %s, %s, %s, %s)", data)
        conn.commit()
        # fix: the original called conn.close() BEFORE cur.close(), closing
        # the cursor on an already-dead connection.
        cur.close()
    finally:
        conn.close()


def main():
    """Scrape the index page, print the rows, then export to Excel and MySQL."""
    html = get_html('http://e.dangdang.com/index_page.html')
    get_data(html)
    print(datalist)
    saveExcel()
    # createSql()  # uncomment on the very first run to create the table
    saveSql(datalist)


if __name__ == '__main__':
    main()
# python爬取当当图片和信息 — "Scraping Dangdang book images and info" (title of the blog post this script came from)
# 最新推荐文章于 2024-06-17 15:56:35 发布 — blog publication metadata, not part of the program