python数据抓取

最新推荐文章于 2021-01-14 08:36:00 发布

mush_me

最新推荐文章于 2021-01-14 08:36:00 发布

阅读量201

点赞数

分类专栏： python 文章标签：数据库 python

本文链接：https://blog.csdn.net/mush_me/article/details/84758643

版权

python 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

经过折腾，python果然是数据抓取的利器。
火车头类的东西，以后我怕是用不上了，用python自己写，速度快而且透明。
数据抓取的大概步骤
1.抓取列表页面链接
2.根据链接，抓取内容，并保存到数据库

以下以cnblogs的新闻为例，使用sqlite做数据库（python内置的，当然是首选）。
1.getnewsList.py


# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request
from urllib import request
# 导入SQLite驱动:
import sqlite3


def saveNews(newsLink,newsTitle=None):
    """Store a news link in the ``news`` table of ``news.db`` unless it is already there.

    The table is created on first use. A newly inserted row gets ``status`` 0
    and no content. A message is printed for both the "already exists" and
    the "saved" outcome.

    Args:
        newsLink: URL of the news article; also the key used to detect
            duplicates.
        newsTitle: Headline text. ``None`` is stored as an empty string.
    """
    # Normalize to a plain ``str`` so that both printing and parameter binding
    # work even when the caller hands over a ``str`` subclass (the visible
    # caller passes the value of BeautifulSoup's ``.string``).
    newsTitle = "" if newsTitle is None else str(newsTitle)

    conn = sqlite3.connect('news.db')
    try:
        cursor = conn.cursor()
        # Create the news table if this is the first run.
        cursor.execute('create table IF NOT EXISTS news (id INTEGER PRIMARY KEY, title varchar(100),link vachar(100),content text,status Integer)')
        # Scraped titles and links are untrusted text: bind them as parameters
        # instead of splicing them into the SQL, so quotes in a headline can
        # neither break nor alter the statement.
        cursor.execute('select 1 from news where link=? limit 1', (newsLink,))
        if cursor.fetchone() is not None:
            # The link was stored by an earlier run.
            print('链接已经存在:'+newsLink)
        else:
            cursor.execute('insert into news (title, link,status) values (?, ?, 0)', (newsTitle, newsLink))
            print("save success."+newsTitle+":"+newsLink)
        cursor.close()
        conn.commit()
    finally:
        # Always release the database file, even when a statement failed.
        conn.close()

def readNews():
    """Read every row of the ``news`` table in ``news.db``.

    The table is not created here; it must already exist.

    Returns:
        A list with one tuple per row, as produced by ``select * from news``.
    """
    conn = sqlite3.connect('news.db')
    try:
        cursor = conn.cursor()
        cursor.execute('select * from news')
        values = cursor.fetchall()
        cursor.close()
        # Hand the rows back to the caller; previously they were fetched and
        # then thrown away, which made the function useless.
        return values
    finally:
        # Close the connection even if the query raised.
        conn.close()


# 1. Download the HTML of the cnblogs news front page.
with request.urlopen('http://news.cnblogs.com/') as f:
    html_doc=f.read()

# 2. Parse the page and extract each headline's title and link.
#    The expected markup looks like this:
#<h2 class="news_entry">
#	<a href="/n/535728/" target="_blank">传Windows 10 Mobile Build 11088下月初发布</a>
#</h2>
soup = BeautifulSoup(html_doc,"html.parser")
news_array=soup.find_all('h2', {'class': 'news_entry'})
for news in news_array:
    anchor = news.a
    href = anchor.get("href") if anchor is not None else None
    if not href:
        # A heading without a usable <a href="..."> cannot be stored as a
        # link; skip it instead of aborting the whole run.
        continue
    # The hrefs are site-relative (see the sample above), so prefix the host.
    saveNews("http://news.cnblogs.com"+href,anchor.string)

#readNews()

2.getnewsContent.py


# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request
from urllib import request
# 导入SQLite驱动:
import sqlite3


def updateNewsContent():
    """Download and store the article body for every news row with status 0.

    For each pending row in ``news.db`` the linked page is fetched through
    ``getNewsContent``, the result is written to the ``content`` column and
    the row's ``status`` is set to 1. Any error raised while fetching or
    updating propagates to the caller; rows completed before the error stay
    saved.
    """
    conn = sqlite3.connect('news.db')
    try:
        cursor = conn.cursor()
        # Select only the columns that are used, by name, rather than relying
        # on the positions of ``select *``.
        cursor.execute('select id, link from news where status=0')
        pending = cursor.fetchall()

        for news_id, link in pending:
            content = getNewsContent(link)
            cursor.execute('update news set content=?,status=1 where id=?',(content, news_id))
            # Commit row by row: a failure on a later link must not discard
            # the content that was already downloaded.
            conn.commit()
        cursor.close()
    finally:
        # Release the database file whether or not the loop finished.
        conn.close()

#根据链接获取内容
def getNewsContent(newsLink):
    """Download a news page and return the inner HTML of its ``news_body`` div.

    Args:
        newsLink: URL of the news article page.

    Returns:
        The string forms of all child nodes of ``<div id="news_body">``
        concatenated in document order, or an empty string when the page
        contains no such element.
    """
    # 1. Download the page HTML.
    with request.urlopen(newsLink) as f:
        html_doc=f.read()

    # 2. Parse the page and locate the article body.
    soup = BeautifulSoup(html_doc,"html.parser")
    news_content=soup.find('div', {'id': 'news_body'})
    if news_content is None:
        # ``find`` returns None when nothing matches; report "no content"
        # instead of failing with an AttributeError on ``.contents``.
        return ''
    return ''.join(str(x) for x in news_content.contents)

# Fetch every news item that has no content yet and fill its content in.
updateNewsContent()