'''
Created on 2018-4-9
@author: Administrator
'''
import io
import re
import sys
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
# Site root used to turn the relative "/wiki/..." hrefs into absolute URLs.
WIKI_BASE_URL = "https://en.wikipedia.org"
# Page whose outgoing article links are harvested.
START_PAGE_URL = WIKI_BASE_URL + "/wiki/Wiki"

# Raw strings are required here: a plain "\." is an invalid string escape
# that newer Python versions warn about.  Compiling once also keeps the
# pattern lookup out of the per-link loop.
WIKI_HREF_RE = re.compile(r"^/wiki/")
JPG_HREF_RE = re.compile(r"\.(jpg|JPG)$")

INSERT_URL_SQL = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"


def is_jpg_href(href):
    """Return True when *href* ends in ``.jpg`` or ``.JPG``."""
    return JPG_HREF_RE.search(href) is not None


def absolute_url(href):
    """Return the absolute Wikipedia URL for a site-relative *href*."""
    return WIKI_BASE_URL + href


def fetch_wiki_links(page_url=START_PAGE_URL):
    """Download *page_url* and return its internal article links.

    Returns a list of ``(link_text, absolute_url)`` tuples, in document
    order, for every ``<a>`` whose href starts with ``/wiki/`` and does not
    end in ``.jpg``/``.JPG``.
    """
    # The context manager closes the HTTP response even if decoding fails.
    with urlopen(page_url) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    return [
        (anchor.get_text(), absolute_url(anchor["href"]))
        for anchor in soup.find_all("a", href=WIKI_HREF_RE)
        if not is_jpg_href(anchor["href"])
    ]


def store_links(links):
    """Print each ``(name, url)`` pair and insert it into the ``urls`` table.

    A single connection is shared by all rows instead of reconnecting for
    every link.  Each row is still committed individually, so rows written
    before a failure stay in the database.
    """
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    conn = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='123456',
        db='wikiurl',
        charset='utf8mb4'
    )
    try:
        with conn.cursor() as cursor:
            for name, href in links:
                print(name, "<----->", href)
                cursor.execute(INSERT_URL_SQL, (name, href))
                conn.commit()
    finally:
        conn.close()


def main():
    """Scrape the start page and persist every non-JPG article link."""
    # Force UTF-8 output so link texts print on consoles whose default
    # encoding cannot represent them.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
    links = fetch_wiki_links()
    # Nothing to store: skip opening a database connection at all.
    if links:
        store_links(links)


if __name__ == "__main__":
    main()