from copy import copy
import requests
import re
from bs4 import BeautifulSoup
import urllib.request
import ssl
import DBUtils
import xlwt
import xlrd
from xlutils.copy import copy
# Swap the factory the standard library's HTTPS clients (http.client / urllib)
# use for their default SSL context with the unverified variant, i.e. disable
# certificate verification for those clients for the whole process.
# NOTE(review): the fetches visible in this file go through `requests` to
# plain http:// URLs, so this override may not be needed at all — confirm
# before keeping it, since it weakens TLS for every stdlib HTTPS call.
ssl._create_default_https_context = ssl._create_unverified_context
def getContent(timeout=10):
    """Download the Baidu "top boards" index page and parse it.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page parsed by BeautifulSoup with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        'Connection': 'keep-alive'
    }
    url = "http://top.baidu.com/boards?fr=topindex"
    r = requests.get(url, headers=headers, timeout=timeout)
    # Force the decoding used for r.text instead of relying on detection.
    r.encoding = 'GB2312'
    return BeautifulSoup(r.text, "html.parser")
def yin(li, timeout=10):
    """Download one page below ``http://top.baidu.com`` and parse it.

    Args:
        li: Text appended verbatim to ``http://top.baidu.com`` to form the
            request URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page parsed by BeautifulSoup with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        'Connection': 'keep-alive'
    }
    url = "http://top.baidu.com%s" % li
    r = requests.get(url, headers=headers, timeout=timeout)
    # Force the decoding used for r.text instead of relying on detection.
    r.encoding = 'GB2312'
    return BeautifulSoup(r.text, "html.parser")
def list():
    """Return the href values found in the index page's "links" div.

    The div is rendered to text and scanned with a regular expression; the
    last match is discarded before returning.

    NOTE: this function shadows the builtin ``list`` for the rest of the
    module, so the builtin must not be called by that name in this file.
    """
    links_div = getContent().find("div", attrs={"class": "links"})
    hrefs = re.findall(r'href="(.+?)"', str(links_div))
    # Drop the final link (raises IndexError when nothing matched).
    hrefs.pop()
    return hrefs
def lei():
    """Return the anchor texts found in the index page's "links" div.

    Each ``<a>`` contributes its ``.string`` value; the last entry is
    discarded before returning.
    """
    links_div = getContent().find("div", attrs={"class": "links"})
    labels = [anchor.string for anchor in links_div.find_all("a")]
    # Drop the final label (raises IndexError when no anchors were found).
    labels.pop()
    return labels
def inp(li, x, bie):
    """Scrape one board page and store its entries in MySQL and the workbook.

    Args:
        li: Page path handed to ``yin`` to fetch the board.
        x: Zero-based position of this board; its rows are written starting
            at spreadsheet row ``1 + x * 50``.
        bie: Category label stored alongside every entry.

    Raises:
        IndexError: If the page yields more titles than score cells.
    """
    cou = yin(li)
    title_links = cou.find_all("a", attrs={"class": "list-title"})
    score_cells = cou.find_all("td", attrs={"class": "last"})
    namelist = [link.string for link in title_links]
    fenlist = [cell.find("span").string for cell in score_cells]

    # The values come from scraped HTML, so let the driver escape them rather
    # than interpolating them into the statement (a quote in a title would
    # otherwise break, or inject into, the SQL).
    sql = "Insert into fengyun (name,cat,inde)values(%s,%s,%s);"
    conn = DBUtils.getConnect()
    cursor = None
    try:
        cursor = conn.cursor()
        for f, name in enumerate(namelist):
            # str() mirrors the text the previous %-formatting produced.
            cursor.execute(sql, (str(name), str(bie), str(fenlist[f])))
            conn.commit()
    finally:
        # Release the connection even when an insert fails part-way.
        DBUtils.closeConnect(cursor, conn)

    # xlrd workbooks are read-only; `copy` (the later of the file's two
    # `copy` imports, i.e. xlutils.copy.copy) turns it into a writable one.
    wb = xlrd.open_workbook("风云.xls")
    copyWb = copy(wb)
    sheet = copyWb.get_sheet(0)
    first_row = 1 + x * 50
    for i, name in enumerate(namelist):
        row = first_row + i
        sheet.write(row, 1, name)
        sheet.write(row, 2, bie)
        sheet.write(row, 3, fenlist[i])
        # NOTE(review): by operator precedence this is `i + x`; it is kept
        # as-is, but a different ordinal may have been intended — confirm.
        sheet.write(row, 0, i + 1 * x)
    copyWb.save("风云.xls")
def saveExcel():
    """Create the "风云.xls" workbook with a single sheet holding only the header row."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("百度风云")
    # Row 0 carries the four column titles, one per column.
    for column, title in enumerate(["顺序", "电影名", "类别", "评分"]):
        sheet.write(0, column, title)
    workbook.save("风云.xls")
# Driver: collect board links and labels, create the workbook, then process
# every board in order.
li = list()
# NOTE: this rebinds the name `lei` from the function to the list it returned.
lei = lei()
saveExcel()

# x is the running board counter that inp() uses to place spreadsheet rows.
x = 0
for p in range(len(li)):
    inp(li[p], x, lei[p])
    x += 1
print("完成")
import pymysql.cursors
def getConnect():
    """Open and return a new PyMySQL connection using the hard-coded settings below."""
    return pymysql.connect(
        host="",
        user="root",
        password="123",
        database="pymysql",
        charset="utf8",
    )
def closeConnect(cursor, conn):
    """Close the cursor and then the connection, skipping any falsy one.

    The connection is closed even when closing the cursor raises, so a
    failing cursor cannot leak the connection; the cursor's exception still
    propagates to the caller.

    Args:
        cursor: Object with a ``close()`` method, or a falsy value to skip.
        conn: Object with a ``close()`` method, or a falsy value to skip.
    """
    try:
        if cursor:
            cursor.close()
    finally:
        if conn:
            conn.close()
def insertData(sql, cursor, conn, params=None):
    """Execute one statement on ``cursor`` and commit it on ``conn``.

    Args:
        sql: Statement text. When ``params`` is given it should contain the
            driver's placeholders instead of interpolated values.
        cursor: Cursor used to execute the statement.
        conn: Connection on which the change is committed.
        params: Optional values bound to the placeholders by the driver, so
            callers do not have to build SQL by string formatting. When
            omitted, ``sql`` is executed exactly as given.
    """
    if params is None:
        cursor.execute(sql)
    else:
        cursor.execute(sql, params)
    conn.commit()