from urllib import request
from bs4 import *

# Fetch the page and parse it. Pass the parser name explicitly so
# BeautifulSoup does not emit a GuessedAtParserWarning and parses
# identically on every machine.
html = request.urlopen("http://www.5izdh.cn")
bs0 = BeautifulSoup(html, "html.parser")
# Equivalent to .find_all(class_="text-dark"); "class" is a Python reserved
# word, so the keyword form needs the trailing underscore.
title = bs0.find_all("a", {"class": "text-dark"})
for x in title:
    print(x.get_text())  # get_text() strips the markup, leaving only the text
    # Guard: an <a> tag without an href attribute would make x["href"]
    # raise KeyError (a Tag supports dict-style attribute access).
    if "href" in x.attrs:
        print(x["href"])
中文链接
from urllib import request
from urllib import parse
from bs4 import *

url0 = "https://zh.wikipedia.org/wiki/Wikipedia:分類索引"
# urlopen only accepts ASCII, so percent-encode the non-ASCII characters.
# Characters listed in the second argument ('/:?=') are left untouched.
link = parse.quote(url0, '/:?=')
html = request.urlopen(link)
bs0 = BeautifulSoup(html, "html.parser")  # explicit parser avoids a warning
title = bs0.find_all("a")
for x in title:
    # Some <a> tags have no href attribute; printing x["href"] blindly
    # would raise KeyError, so test for the attribute first.
    if 'href' in x.attrs:
        print(x["href"])
PyMySQL
这里用的本地MySQL服务器,需要安装
from urllib import request
from urllib import parse
from bs4 import *
import pymysql

url0 = "http://5izdh.cn/wordpress/"
# urlopen only accepts ASCII; percent-encode everything except '/:?='.
link = parse.quote(url0, '/:?=')
html = request.urlopen(link)
bs0 = BeautifulSoup(html, "html.parser")  # explicit parser avoids a warning
title = bs0.find_all("a", {"class": "text-dark"})

# Local MySQL server (db="scraping", port=3306 could also be passed here).
con0 = pymysql.connect(host="127.0.0.1", user="zanllp",
                       password="**********", charset='utf8')
cur0 = con0.cursor()
try:
    cur0.execute("USE scraping")  # see the basic MySQL notes
    for x in title:
        # Parameterized query: the driver escapes the values itself, which
        # prevents SQL injection and the quoting bugs that building the
        # statement with %-formatting would cause (e.g. a '"' in the text).
        cur0.execute("INSERT INTO pages (title,content) VALUES(%s,%s)",
                     (x.get_text(), x["href"]))
    # Autocommit is off by default; commit explicitly to persist the inserts.
    con0.commit()
finally:
    # Always release the cursor and connection, even if an insert fails.
    cur0.close()
    con0.close()
也可以这样
from urllib import request
from urllib import parse
from bs4 import *
import pymysql

url0 = "http://5izdh.cn/wordpress/"
# urlopen only accepts ASCII; characters in '/:?=' are left unencoded.
link = parse.quote(url0, '/:?=')
html = request.urlopen(link)
bs0 = BeautifulSoup(html, "html.parser")  # explicit parser avoids a warning
title = bs0.find_all("a", {"class": "text-dark"})
content = bs0.find_all("p")

con0 = pymysql.connect(host="127.0.0.1", user="zanllp",
                       password="********", charset='utf8')  # db="scraping", port=3306
cur0 = con0.cursor()
try:
    cur0.execute("USE scraping")
    # zip() pairs each title with a paragraph and stops at the shorter
    # sequence, avoiding the IndexError the manual counter risked when
    # len(content) < len(title).
    for t, c in zip(title, content):
        # Parameterized query: driver-side escaping instead of %-formatting,
        # which would break on quotes in the scraped text (SQL injection).
        cur0.execute("INSERT INTO pages (title,content) VALUES(%s,%s)",
                     (t.get_text(), c.get_text()))
    con0.commit()  # autocommit is off by default; commit to persist the rows
finally:
    cur0.close()
    con0.close()
妈的这两天老是分不清select ,delete,老是写成delect,每写一次就丢给我个You have an error in your SQL syntax;
精确的选择标签,先确定一个大的范围,再在这个范围里找
from urllib import request
from urllib import parse
from bs4 import *
import pymysql


def FirstURL(url):
    """Print the text of every table cell in rows carrying class "hideline".

    Scoped selection: first pick the enclosing rows, then search for cells
    only inside each row — find/find_all called on a Tag is restricted to
    that Tag's subtree.
    """
    furl_html = request.urlopen(url)
    # Explicit parser avoids a GuessedAtParserWarning.
    furl_bs = BeautifulSoup(furl_html, "html.parser")
    # furl_bs.find_all("span", {"class": {"icon-rise", "icon-fair", "icon-fall"}})
    # would select the trend icons instead.
    for row in furl_bs.find_all("tr", {"class": "hideline"}):
        for cell in row.find_all("td"):
            print(cell.get_text())
            # cell.a.get_text() would print the first descendant <a>'s text


FirstURL("http://top.baidu.com/buzz?b=1&fr=tph_right")