Python 爬虫学习

最新推荐文章于 2024-05-02 14:08:21 发布

lixu_csdn

最新推荐文章于 2024-05-02 14:08:21 发布

阅读量137

点赞数

分类专栏： Python 文章标签： Python

本文链接：https://blog.csdn.net/lixu_csdn/article/details/89494920

版权

Python 专栏收录该内容

0 篇文章 0 订阅

订阅专栏

小白学习笔记，欢迎大神指教

import pymysql
import requests
from bs4 import BeautifulSoup
import time
import sys
# ----------------------------- collect scraped data --- begin -----------------
# Fetches the Douban music chart page and collects the text of every <a> tag
# that has a single string child into `dataList`; `num` counts those entries.
num = 0  # running count of collected link texts
# Created before the loop so results from every page accumulate and the name
# exists even when the page range is empty.
dataList = []
# Browser-like User-Agent string sent with every request.
headers = {'User-Agent': 'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11'}
# Loop over page indexes; a single page is fetched in this example.
for page in range(1):
    time.sleep(1)  # pause one second before each request
    # The chart URL is fixed; it does not depend on `page`.
    url = 'https://music.douban.com/chart'
    # The User-Agent must go through `headers=`: the second positional
    # argument of requests.get() is `params` (the query string), so passing the
    # dict positionally never set the header. The timeout keeps a stalled
    # connection from hanging the script forever.
    req = requests.get(url, headers=headers, timeout=10)
    print(req.status_code)
    # Decode the response body with the file-system encoding.
    # NOTE(review): this may not match the charset the page declares — confirm.
    req.encoding = sys.getfilesystemencoding()
    # Parse the HTML so the tags can be searched.
    soup = BeautifulSoup(req.text, 'html.parser')
    # `.string` is None for tags without exactly one string child; skip those.
    for link in soup.find_all('a'):
        msg = link.string
        if msg is not None:
            num += 1
            dataList.append(msg)
# ----------------------------- collect scraped data --- end -------------------
# ----------------------------- store collected data in the database --- begin -
# Inserts every entry of `dataList` into the `content` column of table `test`.

# Open the database connection. Keyword arguments are used because
# PyMySQL 1.0+ no longer accepts positional connection parameters.
db = pymysql.connect(host="localhost", user="root", password="123456", database="pythondb")

try:
    # The cursor is closed automatically when the `with` block exits.
    with db.cursor() as cursor:
        # Parameterized statement: the driver escapes the value, so scraped
        # text containing quotes can neither break nor inject into the SQL.
        sql = 'INSERT INTO test (content) VALUES (%s)'
        # enumerate() yields the true position even for duplicate entries
        # (list.index() always returned the first match) and avoids a rescan
        # of the list for every row.
        for position, text in enumerate(dataList, start=1):
            print("序号：%s   值：%s" % (position, text))
            # str() turns BeautifulSoup string objects into plain str.
            cursor.execute(sql, (str(text),))
            # Commit after each row so rows inserted before a failure persist.
            db.commit()
except Exception as exc:
    # Best-effort script: undo the failed statement and report the cause
    # instead of hiding it.
    db.rollback()
    print("insert failed, rolled back: %r" % (exc,), file=sys.stderr)
finally:
    # Close the connection on every path.
    db.close()
# ----------------------------- store collected data in the database --- end ---