(1) Crawl the China National Bureau of Statistics website for the province–city–district–town/township names of all 34 current provincial-level regions, i.e., administrative divisions down to the town, township, village, or farm level. Save the results by province or municipality into text files, one file per province/municipality. Link: https://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html
(2) Crawl all novels on the first 10 pages of Douban's novel listing, collecting for each novel: author, title, publisher, publication date, price, rating, number of ratings, and synopsis. Store this information in a database (not limited to SQLite), one record per novel. Finally, implement a feature to query a given novel's information. Page address: https://book.douban.com/tag/小说
Answer:
(1)
import requests
import re

# Fetch page source; the NBS pages do not declare their charset reliably,
# so let requests detect the encoding from the content.
root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'

def getHTML(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    r.encoding = r.apparent_encoding
    return r.text

html = getHTML(root_url)

# Regular expression for the province links on the index page.
pattern = "<td><a href='(.+?)'>(.+?)<br/></a></td>"
links = re.findall(pattern, html)
province_urls = [root_url[:root_url.rfind("/") + 1] + item[0] for item in links]
province_names = [item[1] for item in links]

# Crawl each province down to the village level and write the names into
# one text file per province or municipality.
def Crawling(province_urls, province_names):
    for p_url, p_name in zip(province_urls, province_names):
        with open(p_name + '.txt', 'w', encoding='utf-8') as f:
            # City level: the second cell of each 'citytr' row links to
            # the city page and holds the city name.
            html = getHTML(p_url)
            pattern = "<tr class='citytr'><td><a href='.+?'>.+?</a></td><td><a href='(.+?)'>(.+?)</a></td>"
            links = re.findall(pattern, html)
            city_urls = [p_url[:p_url.rfind("/") + 1] + item[0] for item in links]
            for _, name in links:
                f.write(name + "\n")
            for c_url in city_urls:
                # County/district level.
                html = getHTML(c_url)
                pattern = "<tr class='countytr'><td><a href='.+?'>.+?</a></td><td><a href='(.+?)'>(.+?)</a></td>"
                links = re.findall(pattern, html)
                county_urls = [c_url[:c_url.rfind("/") + 1] + item[0] for item in links]
                for _, name in links:
                    f.write(name + "\n")
                for ct_url in county_urls:
                    # Town/township level.
                    html = getHTML(ct_url)
                    pattern = "<tr class='towntr'><td><a href='.+?'>.+?</a></td><td><a href='(.+?)'>(.+?)</a></td>"
                    links = re.findall(pattern, html)
                    town_urls = [ct_url[:ct_url.rfind("/") + 1] + item[0] for item in links]
                    for _, name in links:
                        f.write(name + "\n")
                    for t_url in town_urls:
                        # Village level: rows carry no further links; the
                        # third cell holds the village name.
                        html = getHTML(t_url)
                        pattern = "<tr class='villagetr'><td>.+?</td><td>.+?</td><td>(.+?)</td></tr>"
                        for name in re.findall(pattern, html):
                            f.write(name + "\n")

# Run the crawler.
Crawling(province_urls, province_names)
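The crawler above fires thousands of sequential requests, and a server under that load may drop or throttle connections. A small retry wrapper around getHTML is one way to cope; this is only a sketch, getHTML_retry is a hypothetical helper not part of the original answer, and the retry count and delay are arbitrary choices:

import time

def getHTML_retry(url, retries=3, delay=2.0):
    # Hypothetical helper: retry a failed fetch a few times, pausing
    # between attempts (retry count and delay are arbitrary).
    for attempt in range(retries):
        try:
            return getHTML(url)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

# Usage: replace each getHTML(...) call inside Crawling with getHTML_retry(...).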
(2)
# Imports
import requests
from bs4 import BeautifulSoup
import sqlite3

# Create the table (one record per novel).
conn = sqlite3.connect('豆瓣.db')
sql_tables = ("create table if not exists xiaoshuo("
              "id INTEGER primary key autoincrement, "
              "title text, author text, score text, people text, content text)")
conn.execute(sql_tables)
conn.commit()

# Build the links for the first 10 pages: each listing page shows 20
# books and is addressed via the start parameter (visible in the
# paginator links), so the first 10 pages are start=0, 20, ..., 180.
root_url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'
}
page_urls = [root_url + '?start={}'.format(20 * i) for i in range(10)]

print("Storing into the database....")
for url in page_urls:
    # Fetch the source of each page.
    html = requests.get(url, headers=headers).text
    bs = BeautifulSoup(html, 'html.parser')
    # Each book sits in its own <li class="subject-item">; extracting
    # the fields per item keeps them aligned even when a book has no
    # rating or no synopsis.
    for item in bs.select('li.subject-item'):
        # Title.
        title = item.h2.get_text(strip=True) if item.h2 else ''
        # Author, publisher, publication date, and price share one line.
        pub = item.select_one('div.pub')
        # Rating.
        score = item.select_one('span.rating_nums')
        # Number of ratings.
        people = item.select_one('span.pl')
        # Synopsis.
        content = item.select_one('div.info p')
        # Parameterized insert: safe against quotes in titles/synopses.
        conn.execute("insert into xiaoshuo values(null, ?, ?, ?, ?, ?)",
                     (title,
                      pub.text.strip() if pub else '',
                      score.text.strip() if score else '',
                      people.text.strip() if people else '',
                      content.text.strip() if content else ''))
    conn.commit()
print("Done, all records stored.")

# Query all records.
cursor = conn.cursor()
cursor.execute('select * from xiaoshuo')
print(cursor.fetchall())

# Query one novel's information by title.
cursor.execute('select * from xiaoshuo where title = ?', ('活着',))
print(cursor.fetchall())
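The task also asks for a reusable lookup feature for a single novel. A minimal sketch of such a helper, assuming the table layout above; query_book is a hypothetical name, and LIKE is used because the scraped h2 text can carry a subtitle after the main title, which makes exact matching brittle:

def query_book(name):
    # Fuzzy match on the title column; returns all matching rows.
    cur = conn.cursor()
    cur.execute('select * from xiaoshuo where title like ?',
                ('%' + name + '%',))
    return cur.fetchall()

# Example: look up "活着" (To Live).
for row in query_book('活着'):
    print(row)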