# Python crawler for the Baidu Top ("风云榜") trending charts

from copy import copy
import requests
import re
from bs4 import BeautifulSoup
import urllib.request
import ssl
import DBUtils
import xlwt
import xlrd
from xlutils.copy import copy
ssl._create_default_https_context = ssl._create_unverified_context
def getContent():
    """Download the Baidu Top boards index page and return it parsed.

    Returns a BeautifulSoup tree built with the stdlib html.parser.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "Connection": "keep-alive",
    }
    response = requests.get("http://top.baidu.com/boards?fr=topindex", headers=request_headers)
    # The page is served in a GB-family encoding; force it so .text decodes
    # without mojibake.
    response.encoding = 'GB2312'
    return BeautifulSoup(response.text, "html.parser")
def yin(li):
    """Fetch one category page given its relative link *li*.

    *li* is a path like "./buzz?b=338" appended to the site root.
    Returns the page parsed as a BeautifulSoup tree.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "Connection": "keep-alive",
    }
    target = "http://top.baidu.com%s" % li
    response = requests.get(target, headers=request_headers)
    # Same GB-family encoding fix as getContent().
    response.encoding = 'GB2312'
    return BeautifulSoup(response.text, "html.parser")
# Returns the links of the five category pages.
def list():
    """Return the category page hrefs scraped from the "links" div.

    The last extracted href is not a category link and is dropped.

    NOTE(review): this function shadows the builtin ``list``; the name is
    kept because module-level callers use it, but new code should avoid it.
    """
    soup = getContent()
    links_div = soup.find("div", attrs={"class": "links"})
    # Pull every href out of the div's raw HTML.
    hrefs = re.findall(r'href="(.+?)"', str(links_div))
    hrefs.pop()  # trailing link is not a category page
    return hrefs
# Returns the names of the five categories.
def lei():
    """Return the visible names of the category links on the boards page.

    The last anchor is not a category and is excluded, mirroring the
    trailing-link drop in list().
    """
    page = getContent()
    links_div = page.find("div", attrs={"class": "links"})
    names = [anchor.string for anchor in links_div.find_all("a")]
    return names[:-1]
# Persist one category page; x is the page index, bie the category name.
def inp(li, x, bie):
    """Scrape one category page and persist its entries.

    Parameters:
        li:  relative link of the category page (e.g. "./buzz?b=338").
        x:   0-based page index; each page owns a 50-row block in the sheet.
        bie: category name stored alongside each entry.

    Inserts (name, category, score) rows into the MySQL table ``fengyun``
    and appends the same rows to the existing workbook 风云.xls.
    """
    page = yin(li)
    title_anchors = page.find_all("a", attrs={"class": "list-title"})
    score_cells = page.find_all("td", attrs={"class": "last"})
    namelist = [anchor.string for anchor in title_anchors]
    fenlist = [cell.find("span").string for cell in score_cells]

    conn = DBUtils.getConnect()
    cursor = conn.cursor()
    try:
        for f in range(len(namelist)):
            # Parameterized query: scraped strings must never be interpolated
            # into the SQL text (injection / quoting bugs with ' in names).
            cursor.execute(
                "Insert into fengyun (name,cat,inde)values(%s,%s,%s);",
                (namelist[f], bie, fenlist[f]),
            )
            conn.commit()  # commit per row, matching the original behavior
    finally:
        # Ensure the connection is released even if an insert fails.
        DBUtils.closeConnect(cursor, conn)

    wb = xlrd.open_workbook("风云.xls")
    # xlrd books are read-only; copy into a writable xlwt book.
    copyWb = copy(wb)
    sheet = copyWb.get_sheet(0)
    base = 1 + x * 50  # row 0 is the header; 50 entries per page
    for i, name in enumerate(namelist):
        # Bug fix: original wrote `i + 1 * x` (== i + x) for the rank column,
        # inconsistent with the row offset used for the other columns.
        sheet.write(base + i, 0, i + 1 + x * 50)
        sheet.write(base + i, 1, name)
        sheet.write(base + i, 2, bie)
        sheet.write(base + i, 3, fenlist[i])
    # Saving under the same name overwrites the previous file.
    copyWb.save("风云.xls")
# Create the workbook with its header row.
def saveExcel():
    """Create 风云.xls containing one sheet with a Chinese header row."""
    book = xlwt.Workbook()
    sheet = book.add_sheet("百度风云")
    for col, title in enumerate(["顺序", "电影名", "类别", "评分"]):
        sheet.write(0, col, title)
    book.save("风云.xls")
# li = "./buzz?b=338"
# inp(li)
# print(lei())
li = list()
lei = lei()
saveExcel()
# Crawl every category page; the index doubles as the Excel page offset.
for x in range(len(li)):
    inp(li[x], x, lei[x])
print("完成")

import pymysql.cursors

# Open a database connection.
def getConnect():
    """Open and return a new pymysql connection to the ``pymysql`` database.

    NOTE(review): host is the empty string — confirm the intended MySQL host
    (likely "localhost").
    """
    return pymysql.connect(host="", user="root", password="123", database="pymysql", charset="utf8")
# Release database resources.
def closeConnect(cursor, conn):
    """Close the cursor, then the connection, skipping any falsy argument."""
    for resource in (cursor, conn):
        if resource:
            resource.close()
def insertData(sql, cursor, conn):
    """Execute *sql* on *cursor* and immediately commit on *conn*."""
    cursor.execute(sql)
    conn.commit()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值