#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Quick read-back check (commented out): fetch the row with id=1 from the pages table
# import pymysql
# conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='mysql')
# cur = conn.cursor()
# cur.execute("use scraping")
# cur.execute("select * from pages where id=1")
# print(cur.fetchone())
# cur.close()
# conn.close()
# Crawl Wikipedia pages and store them in the database
import datetime
import random
import re
from urllib.request import urlopen
import pymysql
from bs4 import BeautifulSoup
# Connect to the local MySQL server and switch to the scraping database
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute("use scraping")
# Seed with a float timestamp; Python 3.11+ no longer accepts datetime objects as seeds
random.seed(datetime.datetime.now().timestamp())
def store(title, content):
    # Let pymysql quote the parameters itself; the placeholders must not be wrapped in quotes
    cur.execute("insert into pages (title, content) values (%s, %s)", (title, content))
    cur.connection.commit()
def getLinks(articleUrl):
    # articleUrl already begins with "/wiki/...", so append it to the bare domain
    html = urlopen("https://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    title = bsObj.find("h1").get_text()
    # First paragraph of the article body
    content = bsObj.find("div", {"id": "mw-content-text"}).find("p").get_text()
    store(title, content)
    # Internal article links only: /wiki/... paths without a colon (filters out special pages)
    return bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
try:
    while len(links) > 0:
        # Pick a random internal link, store that page, and keep crawling from it
        newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
finally:
    # Always release the cursor and connection, even if the crawl is interrupted
    cur.close()
    conn.close()
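
# The script above assumes the "scraping" database and a "pages" table already exist.
# A minimal schema sketch that matches the columns used here (the id column is implied
# by the commented-out "select * from pages where id=1" check; the exact column types
# are an assumption, not part of the original script):
#
#   CREATE DATABASE IF NOT EXISTS scraping CHARACTER SET utf8;
#   USE scraping;
#   CREATE TABLE IF NOT EXISTS pages (
#       id INT NOT NULL AUTO_INCREMENT,
#       title VARCHAR(200),
#       content TEXT,
#       PRIMARY KEY (id)
#   );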