本文为学习笔记,记录了由University of Michigan推出的Coursera专项课程——Python for Everybody中Chapter12、13、15及16中的部分样例和全部作业代码。其中,作业代码均已通过测试。
一、Chapter 12
0. Sample Code
# To run this, download the BeautifulSoup zip file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
# Look at the parts of a tag
print('TAG:', tag)
print('URL:', tag.get('href', None))
print('Contents:', tag.contents[0])
print('Attrs:', tag.attrs)
1. Scraping HTML Data with BeautifulSoup
import urllib.request,urllib.parse,urllib.error
from bs4 import BeautifulSoup
import re
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter-')
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html,'html.parser')
total = 0
count= 0
tags = soup('span')
for tag in tags:
str=tag.contents[0]
lst=re.findall('[0-9]+', str)
try:
num = int(lst[0])
count+= 1
total+= num
except:
continue
print('Count', count)
print('Sum', total)
2. Following Links in HTML Using BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input("Enter URL: ")
count = int(input("Enter count: "))
position = int(input("Enter position: "))-1
for i in range(count):
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
lst = list()
tags = soup('a')
for tag in tags:
lst.append(tag.get('href', None))
url = lst[position]
print("Retrieving: ", url)
二、Chapter 13
1. Extracting Data from XML
import urllib.request, urllib.parse,urllib.error
import ssl
import xml.etree.ElementTree as ET
total = 0
url = input("Enter location: ")
print("Retrieving ", url)
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
uh = urllib.request.urlopen(url, context=ctx)
data = uh.read()
print('Retrieved', len(data), 'characters')
tree = ET.fromstring(data)
counts = tree.findall('.//count')
print('Count: ', len(counts))
lst=tree.findall('comments/comment')
for i in lst:
num = int(i.find('count').text)
total+= num
print("Sum: ", total)
2. Extracting Data from JSON
import urllib.request, urllib.parse,urllib.error
import ssl
import json
total = 0
url = input("Enter location: ")
print("Retrieving ", url)
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
uh = urllib.request.urlopen(url, context=ctx)
data = uh.read()
print('Retrieved', len(data),'characters')
info = json.loads(data)
print('Count:', len(info['comments']))
for dict in info['comments']:
item = dict['count']
total+= int(item)
print('Sum: ', total)
3. Using the GeoJSON API
import urllib.request, urllib.parse, urllib.error
import json
import ssl
api_key = False
if api_key is False:
api_key = 42
serviceurl = 'http://py4e-data.dr-chuck.net/json?'
else :
serviceurl = 'https://maps.googleapis.com/maps/api/geocode/json?'
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
while True:
address = input('Enter location: ')
if len(address) < 1: break
parms = dict()
parms['address'] = address
if api_key is not False: parms['key'] = api_key
url = serviceurl + urllib.parse.urlencode(parms)
print('Retrieving', url)
uh = urllib.request.urlopen(url, context=ctx)
data = uh.read().decode()
print('Retrieved', len(data), 'characters')
try:
js = json.loads(data)
except:
js = None
if not js or 'status' not in js or js['status'] != 'OK':
print('==== Failure To Retrieve ====')
continue
place_id = js['results'][0]['place_id']
print('Place id',place_id)
三、Chapter 15
1. Counting Email in a Database
① Sample Code
import sqlite3
conn = sqlite3.connect('emaildb.sqlite')
cur = conn.cursor()
cur.execute('DROP TABLE IF EXISTS Counts')
cur.execute('''
CREATE TABLE Counts (email TEXT, count INTEGER)''')
fname = input('Enter file name: ')
if (len(fname) < 1): fname = 'mbox-short.txt'
fh = open(fname)
for line in fh:
if not line.startswith('From: '): continue
pieces = line.split()
email = pieces[1]
cur.execute('SELECT count FROM Counts WHERE email = ? ', (email,))
row = cur.fetchone()
if row is None:
cur.execute('''INSERT INTO Counts (email, count)
VALUES (?, 1)''', (email,))
else:
cur.execute('UPDATE Counts SET count = count + 1 WHERE email = ?',
(email,))
conn.commit()
# https://www.sqlite.org/lang_select.html
sqlstr = 'SELECT email, count FROM Counts ORDER BY count DESC LIMIT 10'
for row in cur.execute(sqlstr):
print(str(row[0]), row[1])
cur.close()
② Assignment
#Chapter 15--Counting Email in a Database
#输入文件:mbox.txt/输出文件:orgdb.sqlite
import sqlite3
import re
conn = sqlite3.connect('orgdb.sqlite')
cur = conn.cursor()
cur.execute('DROP TABLE IF EXISTS Counts')
cur.execute('''
CREATE TABLE Counts (org TEXT, count INTEGER)''')
fname=input('Enter file name: ') # fname >>> mbox.txt
fh = open(fname)
for line in fh:
if not line.startswith('From: '): continue
pieces = line.split()
email=pieces[1]
org = re.findall('@(.+)',email)[0]
cur.execute('SELECT count FROM Counts WHERE org = ? ', (org,))
row = cur.fetchone()
if row is None:
cur.execute('''INSERT INTO Counts (org, count)
VALUES (?, 1)''', (org,))
else:
cur.execute('UPDATE Counts SET count = count + 1 WHERE org = ?',
(org,))
conn.commit()
#以下代码仅用作验证结果
sqlstr = 'SELECT org, count FROM Counts ORDER BY count DESC'
for row in cur.execute(sqlstr):
print(str(row[0]), row[1])
cur.close()
2. Multi-Table Database - Tracks
① Sample Code
import xml.etree.ElementTree as ET
import sqlite3
conn = sqlite3.connect('trackdb.sqlite')
cur = conn.cursor()
# Make some fresh tables using executescript()
cur.executescript('''
DROP TABLE IF EXISTS Artist;
DROP TABLE IF EXISTS Album;
DROP TABLE IF EXISTS Track;
CREATE TABLE Artist (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE Album (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
artist_id INTEGER,
title TEXT UNIQUE
);
CREATE TABLE Track (
id INTEGER NOT NULL PRIMARY KEY
AUTOINCREMENT UNIQUE,
title TEXT UNIQUE,
album_id INTEGER,
len INTEGER, rating INTEGER, count INTEGER
);
''')
fname = input('Enter file name: ')
if ( len(fname) < 1 ) : fname = 'Library.xml'
# <key>Track ID</key><integer>369</integer>
# <key>Name</key><string>Another One Bites The Dust</string>
# <key>Artist</key><string>Queen</string>
def lookup(d, key):
found = False
for child in d:
if found : return child.text
if child.tag == 'key' and child.text == key :
found = True
return None
stuff = ET.parse(fname)
all = stuff.findall('dict/dict/dict')
print('Dict count:', len(all))
for entry in all:
if ( lookup(entry, 'Track ID') is None ) : continue
name = lookup(entry, 'Name')
artist = lookup(entry, 'Artist')
album = lookup(entry, 'Album')
count = lookup(entry, 'Play Count')
rating = lookup(entry, 'Rating')
length = lookup(entry, 'Total Time')
if name is None or artist is None or album is None :
continue
print(name, artist, album, count, rating, length)
cur.execute('''INSERT OR IGNORE INTO Artist (name)
VALUES ( ? )''', ( artist, ) )
cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, ))
artist_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id)
VALUES ( ?, ? )''', ( album, artist_id ) )
cur.execute('SELECT id FROM Album WHERE title = ? ', (album, ))
album_id = cur.fetchone()[0]
cur.execute('''INSERT OR REPLACE INTO Track
(title, album_id, len, rating, count)
VALUES ( ?, ?, ?, ?, ? )''',
( name, album_id, length, rating, count ) )
conn.commit()
② Assignment
import xml.etree.ElementTree as ET
import sqlite3
conn = sqlite3.connect('trackdb.sqlite')
cur = conn.cursor()
cur.executescript('''
DROP TABLE IF EXISTS Artist;
DROP TABLE IF EXISTS Genre;
DROP TABLE IF EXISTS Album;
DROP TABLE IF EXISTS Track;
CREATE TABLE Artist (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE Genre (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE Album (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
artist_id INTEGER,
title TEXT UNIQUE
);
CREATE TABLE Track (
id INTEGER NOT NULL PRIMARY KEY
AUTOINCREMENT UNIQUE,
title TEXT UNIQUE,
album_id INTEGER,
genre_id INTEGER,
len INTEGER, rating INTEGER, count INTEGER
);
''')
fname = 'Library.xml'
def lookup(d, key):
found = False
for child in d:
if found : return child.text
if child.tag == 'key' and child.text == key :
found = True
return None
stuff = ET.parse(fname)
all = stuff.findall('dict/dict/dict')
for entry in all:
if ( lookup(entry, 'Track ID') is None ) : continue
name = lookup(entry, 'Name')
artist = lookup(entry, 'Artist')
genre = lookup(entry, 'Genre')
album = lookup(entry, 'Album')
track = lookup(entry, 'Track')
count = lookup(entry, 'Play Count')
rating = lookup(entry, 'Rating')
length = lookup(entry, 'Total Time')
if name is None or artist is None or genre is None or album is None :
continue
cur.execute('''INSERT OR IGNORE INTO Artist (name)
VALUES ( ? )''', ( artist, ) )
cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, ))
artist_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO Genre (name)
VALUES ( ? )''', ( genre, ) )
cur.execute('SELECT id FROM Genre WHERE name = ? ', (genre, ))
genre_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id)
VALUES ( ?, ? )''', ( album, artist_id ) )
cur.execute('SELECT id FROM Album WHERE title = ? ', (album, ))
album_id = cur.fetchone()[0]
cur.execute('''INSERT OR REPLACE INTO Track
(title, album_id, genre_id, len, rating, count)
VALUES ( ?, ?, ?, ?, ?, ? )''',
( name, album_id, genre_id, length, rating, count) )
conn.commit()
#用于检验结果的SQL语句:
sqlstr = '''SELECT Track.title, Artist.name, Album.title, Genre.name
FROM Track JOIN Genre JOIN Album JOIN Artist
ON Track.genre_id = Genre.ID and Track.album_id = Album.id
AND Album.artist_id = Artist.id
ORDER BY Artist.name, Track.title LIMIT 3'''
for row in cur.execute(sqlstr):
print(str(row[0]), row[1],row[2],row[3])
cur.close()
3. Many Students in Many Courses
① Sample Code
import json
import sqlite3
conn = sqlite3.connect('rosterdb.sqlite')
cur = conn.cursor()
# Do some setup
cur.executescript('''
DROP TABLE IF EXISTS User;
DROP TABLE IF EXISTS Member;
DROP TABLE IF EXISTS Course;
CREATE TABLE User (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE Course (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
title TEXT UNIQUE
);
CREATE TABLE Member (
user_id INTEGER,
course_id INTEGER,
role INTEGER,
PRIMARY KEY (user_id, course_id)
)
''')
fname = input('Enter file name: ')
if len(fname) < 1:
fname = 'roster_data_sample.json'
# [
# [ "Charley", "si110", 1 ],
# [ "Mea", "si110", 0 ],
str_data = open(fname).read()
json_data = json.loads(str_data)
for entry in json_data:
name = entry[0]
title = entry[1]
print((name, title))
cur.execute('''INSERT OR IGNORE INTO User (name)
VALUES ( ? )''', ( name, ) )
cur.execute('SELECT id FROM User WHERE name = ? ', (name, ))
user_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO Course (title)
VALUES ( ? )''', ( title, ) )
cur.execute('SELECT id FROM Course WHERE title = ? ', (title, ))
course_id = cur.fetchone()[0]
cur.execute('''INSERT OR REPLACE INTO Member
(user_id, course_id) VALUES ( ?, ? )''',
( user_id, course_id ) )
conn.commit()
② Assignment
import json
import sqlite3
conn = sqlite3.connect('rosterdb.sqlite')
cur = conn.cursor()
cur.executescript('''
DROP TABLE IF EXISTS User;
DROP TABLE IF EXISTS Member;
DROP TABLE IF EXISTS Course;
CREATE TABLE User (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE Course (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
title TEXT UNIQUE
);
CREATE TABLE Member (
user_id INTEGER,
course_id INTEGER,
role INTEGER,
PRIMARY KEY (user_id, course_id)
)
''')
fname = 'roster_data.json'
str_data = open(fname).read()
json_data = json.loads(str_data)
for entry in json_data:
name = entry[0]
title = entry[1]
role =entry[2]
cur.execute('''INSERT OR IGNORE INTO User (name)
VALUES ( ? )''', ( name, ) )
cur.execute('SELECT id FROM User WHERE name = ? ', (name, ))
user_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO Course (title)
VALUES ( ? )''', ( title, ) )
cur.execute('SELECT id FROM Course WHERE title = ? ', (title, ))
course_id = cur.fetchone()[0]
cur.execute('''INSERT OR REPLACE INTO Member
(user_id, course_id, role) VALUES ( ?, ?, ? )''',
( user_id, course_id, role ) )
conn.commit()
sqlstr1 = '''SELECT User.name,Course.title, Member.role FROM
User JOIN Member JOIN Course
ON User.id = Member.user_id AND Member.course_id = Course.id
ORDER BY User.name DESC, Course.title DESC, Member.role DESC LIMIT 2'''
for row in cur.execute(sqlstr1):
print(str(row[0]), row[1], row[2])
sqlstr2 = '''SELECT 'XYZZY' || hex(User.name || Course.title || Member.role ) AS X FROM
User JOIN Member JOIN Course
ON User.id = Member.user_id AND Member.course_id = Course.id
ORDER BY X LIMIT 1'''
for row in cur.execute(sqlstr2):
print(str(row[0]))
cur.close()
四、Chapter 16
1. Step 1
① geoload
import urllib.request, urllib.parse, urllib.error
import http
import sqlite3
import json
import time
import ssl
import sys
api_key = False
# If you have a Google Places API key, enter it here
if api_key is False:
api_key = 42
serviceurl = "http://py4e-data.dr-chuck.net/json?"
else :
serviceurl = "https://maps.googleapis.com/maps/api/geocode/json?"
# Additional detail for urllib
# http.client.HTTPConnection.debuglevel = 1
conn = sqlite3.connect('geodata.sqlite')
cur = conn.cursor()
cur.execute('''
CREATE TABLE IF NOT EXISTS Locations (address TEXT, geodata TEXT)''')
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
fh = open("where.data")
count = 0
for line in fh:
if count > 200 :
print('Retrieved 200 locations, restart to retrieve more')
break
address = line.strip()
print('')
cur.execute("SELECT geodata FROM Locations WHERE address= ?",
(memoryview(address.encode()), ))
try:
data = cur.fetchone()[0]
print("Found in database ",address)
continue
except:
pass
parms = dict()
parms["address"] = address
if api_key is not False: parms['key'] = api_key
url = serviceurl + urllib.parse.urlencode(parms)
print('Retrieving', url)
uh = urllib.request.urlopen(url, context=ctx)
data = uh.read().decode()
print('Retrieved', len(data), 'characters', data[:20].replace('\n', ' '))
count = count + 1
try:
js = json.loads(data)
except:
print(data) # We print in case unicode causes an error
continue
if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') :
print('==== Failure To Retrieve ====')
print(data)
break
cur.execute('''INSERT INTO Locations (address, geodata)
VALUES ( ?, ? )''', (memoryview(address.encode()), memoryview(data.encode()) ) )
conn.commit()
if count % 10 == 0 :
print('Pausing for a bit...')
time.sleep(5)
print("Run geodump.py to read the data from the database so you can vizualize it on a map.")
2. Step 2
②geodump
import sqlite3
import json
import codecs
conn = sqlite3.connect('geodata.sqlite')
cur = conn.cursor()
cur.execute('SELECT * FROM Locations')
fhand = codecs.open('where.js', 'w', "utf-8")
fhand.write("myData = [\n")
count = 0
for row in cur :
data = str(row[1].decode())
try: js = json.loads(str(data))
except: continue
if not('status' in js and js['status'] == 'OK') : continue
lat = js["results"][0]["geometry"]["location"]["lat"]
lng = js["results"][0]["geometry"]["location"]["lng"]
if lat == 0 or lng == 0 : continue
where = js['results'][0]['formatted_address']
where = where.replace("'", "")
try :
print(where, lat, lng)
count = count + 1
if count > 1 : fhand.write(",\n")
output = "["+str(lat)+","+str(lng)+", '"+where+"']"
fhand.write(output)
except:
continue
fhand.write("\n];\n")
cur.close()
fhand.close()
print(count, "records written to where.js")
print("Open where.html to view the data in a browser")