# coding=utf-8
"""Scrape all /wiki/ links from the English Wikipedia main page and
store (link text, absolute URL) pairs in a local MySQL table.

Author: mac
"""
import re
from urllib.request import urlopen

import pymysql
from bs4 import BeautifulSoup

# Fetch the page and decode the response body as UTF-8.
resp = urlopen("https://en.wikipedia.org/wiki/Main_page").read().decode("utf-8")

# Parse the HTML.
soup = BeautifulSoup(resp, "html.parser")

# All <a> tags whose href attribute starts with /wiki/.
# (raw strings for regexes; find_all replaces the deprecated findAll)
list_urls = soup.find_all("a", href=re.compile(r"^/wiki/"))

# Collect (name, absolute_url) rows, skipping image links (.jpg/.JPG),
# printing each kept link as before.
rows = []
for url in list_urls:
    href = url["href"]
    if re.search(r"\.(jpg|JPG)$", href):
        continue
    full_href = "https://en.wikipedia.org" + href
    print(url.get_text(), "<---->", full_href)
    rows.append((url.get_text(), full_href))

# Write ALL collected links to the database.
# BUG FIXED: the original INSERT was outside the for-loop, so only the
# last link (the leftover loop variable) was ever inserted.
connection = pymysql.connect(host='127.0.0.1',
                             port=3306,
                             user='root',
                             passwd='root',
                             db='imooc',
                             charset='utf8mb4')
try:
    # Cursor as context manager so it is closed even on error.
    with connection.cursor() as cursor:
        # Parameterized statement (no string-built SQL).
        sql = "insert into `wikiinfo`(`urlname`,`urlhref`)VALUES (%s,%s)"
        # One executemany + one commit instead of per-row round trips.
        cursor.executemany(sql, rows)
    connection.commit()
finally:
    connection.close()
# Writes the scraped links into a MySQL database.
# (Blog-platform footer from the original article, published 2023-09-21 22:43:53.)