(1)爬取34个省区市所有的省-市-区-镇/乡名称
作业要求:
从国家统计局网站爬取中国现有34个省区市的全部省-市-区-镇/乡名称,即行政区划最小细化到镇、乡村或农场一级。将爬取结果按省或直辖市分别保存到文本文件中,每个省或直辖市一个文件。
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html
import requests
from bs4 import BeautifulSoup
import re
def getHTML(url, timeout=30):
    """Download ``url`` and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            handed straight to ``requests.get``; without it a stalled
            connection would block the whole crawl indefinitely.

    Returns:
        The response body as ``str``, decoded with the ``gbk`` codec.

    Raises:
        requests.exceptions.RequestException: if the connection fails or
            the timeout expires.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    # Override whatever encoding requests guessed; presumably the target
    # site serves GBK-encoded pages -- confirm if the URL set changes.
    r.encoding = 'gbk'
    return r.text
def getNextURL(url, rank):
    """Collect the child-page links listed in the table rows of one page.

    The page at ``url`` is downloaded with ``getHTML`` and every ``<tr>``
    whose CSS class equals ``rank + 'tr'`` (``'citytr'`` for
    ``rank='city'``, and so on) is inspected. Rows that contain no link
    are skipped.

    Args:
        url: Address of the page to scan; also the base against which
            relative links are resolved.
        rank: Prefix of the row class to look for.

    Returns:
        A dict mapping the full text of each linked row to the absolute
        URL its first ``<a>`` points at. Empty when no row matches.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(getHTML(url), 'html.parser')
    links = {}
    for row in soup.find_all('tr', class_=rank + 'tr'):
        anchor = row.a
        if anchor is None:
            continue
        href = anchor.get('href')
        if not href:
            # An <a> without a target cannot be followed.
            continue
        # urljoin resolves the href against the current page, which also
        # copes with absolute or parent-relative links.
        links[row.text] = urljoin(url, href)
    return links
# Walk the index page level by level and write one text file per
# top-level entry. Each file holds the entry's own name followed by the
# row text of every level below it, one line per row.
index = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'

# Link text -> page URL for every anchor on the index page.
city = {}
bs = BeautifulSoup(getHTML(index), 'html.parser')
for item in bs.find_all('a', href=True):
    city[item.text] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/' + item.attrs['href']
# The ICP filing link is not a region; tolerate it being absent.
city.pop('京ICP备05034670号', None)

# Extracts the three cells of a lowest-level row. Compiled once here
# instead of once per fetched page.
village_row = re.compile('<tr.+villagetr.+<td>(.+)</td><td>(.+)</td><td>(.+)</td></tr>')

for province, url1 in city.items():
    # Explicit UTF-8 so the Chinese text is written correctly whatever
    # the platform's default encoding is; the with-block closes the file.
    with open(province + '.txt', 'w', encoding='utf-8') as f:
        f.write(province + '\n')
        print('正在写入' + province)
        for city_text, url2 in getNextURL(url1, 'city').items():
            f.write(city_text + '\n')
            for county_text, url3 in getNextURL(url2, 'county').items():
                f.write(county_text + '\n')
                for town_text, url4 in getNextURL(url3, 'town').items():
                    f.write(town_text + '\n')
                    # The lowest level has no further links, so its rows
                    # are read directly from the fetched page.
                    sp = BeautifulSoup(getHTML(url4), 'html.parser')
                    for row in sp.find_all('tr', class_='villagetr'):
                        for cells in village_row.findall(str(row)):
                            f.write('\t'.join(cells) + '\n')
print('ok.')
(1)爬取豆瓣小说前10页所有小说的信息
作业要求:
爬取豆瓣网小说页面前10页所有小说的信息,每部小说包括:作者、小说名、出版社、出版日期、价格、评分、评分人数、内容简介,并将这些信息存储到数据库中(数据库不限于sqlite),一部小说一条记录。最后需要实现查询某部小说信息的功能。
import requests
from bs4 import BeautifulSoup
import sqlite3
# Scrape ten listing pages, store one row per book in SQLite, then run
# two example queries against the stored data.
conn = sqlite3.connect('douban.db')
# "if not exists" lets the script be re-run against an existing database
# instead of failing on the second run.
sql_tables = "create table if not exists noval(id INTEGER primary key autoincrement, author text, title text, press text, date text, price text, score text, people text, content text )"
conn.execute(sql_tables)
conn.commit()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}
# Placeholders keep quotes inside titles or summaries from breaking
# (or injecting into) the statement.
insert_sql = 'insert into noval values(null, ?, ?, ?, ?, ?, ?, ?, ?)'


def _text(parent, selector, strip=True):
    """Return the text of the first ``selector`` match under ``parent``, or '' if there is none."""
    node = parent.select_one(selector)
    if node is None:
        return ''
    return node.text.strip() if strip else node.text


for index in range(0, 200, 20):
    url = 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=' + str(index) + '&type=T'
    print('正在准备爬取第' + str(index // 20 + 1) + '页')
    html = requests.get(url, headers=headers, timeout=30).text
    bs = BeautifulSoup(html, 'html.parser')

    rows = []
    # Parse book by book so a missing rating or summary leaves an empty
    # field for that book only, instead of shifting every later value.
    # NOTE(review): assumes each book's title, publication line, rating
    # and summary all sit inside its own div.info -- confirm on the page.
    for info in bs.select('div.info'):
        # The publication line is '/'-separated: the first field is used
        # as the author and the last three as press, date and price.
        parts = [part.strip() for part in _text(info, 'div.pub').split('/')]
        # Left-pad so a line with fewer than three fields cannot raise.
        press, date, price = ([''] * 3 + parts)[-3:]
        rows.append((
            parts[0],
            _text(info, 'h2'),
            press,
            date,
            price,
            _text(info, 'span.rating_nums', strip=False),
            _text(info, 'span.pl'),
            _text(info, 'p', strip=False),
        ))
    conn.executemany(insert_sql, rows)
    conn.commit()
print('ok')

# Dump everything that was stored.
sql_select = 'select * from noval'
cursor = conn.cursor()
cursor.execute(sql_select)
result = cursor.fetchall()
print(result)

# Look up the titles by one author. The table is "noval", the one
# created above; the author is bound as a parameter.
sql = 'select title from noval where author=?'
cursor = conn.cursor()
cursor.execute(sql, ('余华',))
result = cursor.fetchall()