(1) Crawl the China National Bureau of Statistics website for the province–city–district–town/township names of all 34 current provincial-level regions, i.e., administrative divisions down to the town, township, village, or farm level. Save the results by province or municipality into text files, one file per province/municipality. Link: https://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html
(2) Crawl all novels on the first 10 pages of Douban's novel listing, collecting for each novel: author, title, publisher, publication date, price, rating, number of ratings, and synopsis. Store this information in a database (not limited to SQLite), one record per novel. Finally, implement a feature to query a given novel's information. Page address: https://book.douban.com/tag/小说
Answer:
(1)
import requests
import re

# Fetch page source; the NBS pages do not declare their charset reliably,
# so let requests detect the encoding from the content.
root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'

def getHTML(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    r.encoding = r.apparent_encoding
    return r.text

html = getHTML(root_url)

# Regular expression for the province links on the index page.
pattern = "<td><a href='(.+?)'>(.+?)<br/></a></td>"
links = re.findall(pattern, html)
province_urls = [root_url[:root_url.rfind("/") + 1] + item[0] for item in links]
province_names = [item[1] for item in links]

# Crawl each province down to the village level and write the names into
# one text file per province or municipality.
def Crawling(province_urls, province_names):
    for p_url, p_name in zip(province_urls, province_names):
        with open(p_name + '.txt', 'w', encoding='utf-8') as f:
            # City level: the second cell of each 'citytr' row links to
            # the city page and holds the city name.
            html = getHTML(p_url)
            pattern = "<tr class='citytr'><td><a href='.+?'>.+?</a></td><td><a href='(.+?)'>(.+?)</a></td>"
            links = re.findall(pattern, html)
            city_urls = [p_url[:p_url.rfind("/") + 1] + item[0] for item in links]
            for _, name in links:
                f.write(name + "\n")
            for c_url in city_urls:
                # County/district level.
                html = getHTML(c_url)
                pattern = "<tr class='countytr'><td><a href='.+?'>.+?</a></td><td><a href='(.+?)'>(.+?)</a></td>"
                links = re.findall(pattern, html)
                county_urls = [c_url[:c_url.rfind("/") + 1] + item[0] for item in links]
                for _, name in links:
                    f.write(name + "\n")
                for ct_url in county_urls:
                    # Town/township level.
                    html = getHTML(ct_url)
                    pattern = "<tr class='towntr'><td><a href='.+?'>.+?</a></td><td><a href='(.+?)'>(.+?)</a></td>"
                    links = re.findall(pattern, html)
                    town_urls = [ct_url[:ct_url.rfind("/") + 1] + item[0] for item in links]
                    for _, name in links:
                        f.write(name + "\n")
                    for t_url in town_urls:
                        # Village level: rows carry no further links; the
                        # third cell holds the village name.
                        html = getHTML(t_url)
                        pattern = "<tr class='villagetr'><td>.+?</td><td>.+?</td><td>(.+?)</td></tr>"
                        for name in re.findall(pattern, html):
                            f.write(name + "\n")

# Run the crawler.
Crawling(province_urls, province_names)
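The crawler above fires thousands of sequential requests, and a server under that load may drop or throttle connections. A small retry wrapper around getHTML is one way to cope; this is only a sketch, getHTML_retry is a hypothetical helper not part of the original answer, and the retry count and delay are arbitrary choices:

import time

def getHTML_retry(url, retries=3, delay=2.0):
    # Hypothetical helper: retry a failed fetch a few times, pausing
    # between attempts (retry count and delay are arbitrary).
    for attempt in range(retries):
        try:
            return getHTML(url)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

# Usage: replace each getHTML(...) call inside Crawling with getHTML_retry(...).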
(2)
# Imports
import requests
from bs4 import BeautifulSoup
import sqlite3

# Create the table (one record per novel).
conn = sqlite3.connect('豆瓣.db')
sql_tables = ("create table if not exists xiaoshuo("
              "id INTEGER primary key autoincrement, "
              "title text, author text, score text, people text, content text)")
conn.execute(sql_tables)
conn.commit()

# Build the links for the first 10 pages: each listing page shows 20
# books and is addressed via the start parameter (visible in the
# paginator links), so the first 10 pages are start=0, 20, ..., 180.
root_url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'
}
page_urls = [root_url + '?start={}'.format(20 * i) for i in range(10)]

print("Storing into the database....")
for url in page_urls:
    # Fetch the source of each page.
    html = requests.get(url, headers=headers).text
    bs = BeautifulSoup(html, 'html.parser')
    # Each book sits in its own <li class="subject-item">; extracting
    # the fields per item keeps them aligned even when a book has no
    # rating or no synopsis.
    for item in bs.select('li.subject-item'):
        # Title.
        title = item.h2.get_text(strip=True) if item.h2 else ''
        # Author, publisher, publication date, and price share one line.
        pub = item.select_one('div.pub')
        # Rating.
        score = item.select_one('span.rating_nums')
        # Number of ratings.
        people = item.select_one('span.pl')
        # Synopsis.
        content = item.select_one('div.info p')
        # Parameterized insert: safe against quotes in titles/synopses.
        conn.execute("insert into xiaoshuo values(null, ?, ?, ?, ?, ?)",
                     (title,
                      pub.text.strip() if pub else '',
                      score.text.strip() if score else '',
                      people.text.strip() if people else '',
                      content.text.strip() if content else ''))
    conn.commit()
print("Done, all records stored.")

# Query all records.
cursor = conn.cursor()
cursor.execute('select * from xiaoshuo')
print(cursor.fetchall())

# Query one novel's information by title.
cursor.execute('select * from xiaoshuo where title = ?', ('活着',))
print(cursor.fetchall())
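The task also asks for a reusable lookup feature for a single novel. A minimal sketch of such a helper, assuming the table layout above; query_book is a hypothetical name, and LIKE is used because the scraped h2 text can carry a subtitle after the main title, which makes exact matching brittle:

def query_book(name):
    # Fuzzy match on the title column; returns all matching rows.
    cur = conn.cursor()
    cur.execute('select * from xiaoshuo where title like ?',
                ('%' + name + '%',))
    return cur.fetchall()

# Example: look up "活着" (To Live).
for row in query_book('活着'):
    print(row)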