# Scrape the internal article links from a Wikipedia page and store them in MySQL.
# Import the required packages.
import re
from urllib.request import urlopen

import pymysql.cursors
from bs4 import BeautifulSoup

# Page to scrape, and the prefix that turns a relative href into an absolute URL.
START_URL = "https://en.wikipedia.org/wiki/Wikipedia"
BASE_URL = "https://en.wikipedia.org"

# Compiled once, outside the loop: keep only "/wiki/..." links and skip
# links that end in .jpg / .JPG.
WIKI_HREF = re.compile(r"^/wiki/")
JPG_HREF = re.compile(r"\.(jpg|JPG)$")

# Request the URL and decode the body as UTF-8; the context manager makes
# sure the HTTP response is closed once the body has been read.
with urlopen(START_URL) as response:
    resp = response.read().decode("utf-8")

# Choose a parser.
soup = BeautifulSoup(resp, "html.parser")

# Collect (link text, absolute URL) pairs, printing each one as it is found.
links = []
for ListUrl in soup.find_all("a", href=WIKI_HREF):
    href = ListUrl["href"]
    if JPG_HREF.search(href):
        # Skip links that end in jpg/JPG.
        continue
    # get_text() is used for both the printed and the stored name; .string
    # is None for anchors that contain nested tags.
    url_name = ListUrl.get_text()
    url_href = BASE_URL + href
    print(url_name, "<-------->", url_href)
    links.append((url_name, url_href))

if links:
    # Open a single database connection for the whole batch instead of
    # reconnecting for every link.
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from the environment or a config file that is not committed.
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='19961016',
        db="wikiurl",
        charset="utf8mb4",
    )
    try:
        # Get a cursor for this session.
        with connection.cursor() as cursor:
            # Build the parameterized SQL statement.
            sql = "insert into`urls`(`urlname`,`urlhref`)values(%s,%s)"
            # Execute the statement once per collected row.
            cursor.executemany(sql, links)
        # Commit all inserted rows in one transaction.
        connection.commit()
    finally:
        connection.close()
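# Introductory example: scraping information into a database.
# (Footer from the source page: latest recommended article published 2023-08-08 11:51:48.)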