爬取信息到数据库入门操作

最新推荐文章于 2023-08-08 11:51:48 发布

阿懵

最新推荐文章于 2023-08-08 11:51:48 发布

阅读量691

点赞数

文章标签： python

本文链接：https://blog.csdn.net/mm1030533738/article/details/78317862

版权

# Crawl Wikipedia: fetch the "Wikipedia" article, print every internal
# /wiki/ link that does not point at a JPG image, and store each link's
# text and absolute URL in the `urls` table of the `wikiurl` MySQL database.

# Import the required packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

import pymysql.cursors

# Prefix used to turn the site-relative hrefs into absolute URLs.
BASE_URL = "https://en.wikipedia.org"
# Compiled once, outside the loop; raw strings keep "\." a regex escape
# rather than an (invalid) string escape.
WIKI_LINK = re.compile(r"^/wiki/")
IMAGE_LINK = re.compile(r"\.(jpg|JPG)$")

# Request the URL and decode the body as UTF-8; the `with` block closes the
# HTTP response as soon as the body has been read.
with urlopen("https://en.wikipedia.org/wiki/Wikipedia") as response:
    resp = response.read().decode("utf-8")

# Parse the page with the built-in HTML parser
soup = BeautifulSoup(resp, "html.parser")

# Open ONE database connection for the whole run instead of reconnecting for
# every link.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file before sharing this script.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='19961016',
                             database="wikiurl",
                             charset="utf8mb4"
                             )

try:
    # Get a cursor; it is closed automatically when the block exits
    with connection.cursor() as cursor:
        # Parameterized SQL statement (values are bound by the driver)
        sql = "insert into`urls`(`urlname`,`urlhref`)values(%s,%s)"

        for ListUrl in soup.find_all("a", href=WIKI_LINK):
            href = ListUrl["href"]
            # Only handle links that do not end in .jpg / .JPG
            if IMAGE_LINK.search(href):
                continue

            url = BASE_URL + href
            print(ListUrl.string, "<-------->", url)

            # Execute the insert, then commit so each row is persisted even
            # if a later insert fails (same durability as one-commit-per-row
            # in the original script).
            cursor.execute(sql, (ListUrl.get_text(), url))
            connection.commit()
finally:
    # Always release the connection, even when an insert raises
    connection.close()