from urllib import request
from bs4 import *

# Fetch the page and parse it. Pass the parser name explicitly so
# BeautifulSoup does not emit a GuessedAtParserWarning and parses
# identically on every machine.
html = request.urlopen("http://www.5izdh.cn")
bs0 = BeautifulSoup(html, "html.parser")
# Equivalent to .find_all(class_="text-dark"); "class" is a Python reserved
# word, so the keyword form needs the trailing underscore.
title = bs0.find_all("a", {"class": "text-dark"})
for x in title:
    print(x.get_text())  # get_text() strips the markup, leaving only the text
    # Guard: an <a> tag without an href attribute would make x["href"]
    # raise KeyError (a Tag supports dict-style attribute access).
    if "href" in x.attrs:
        print(x["href"])
中文链接
from urllib import request
from urllib import parse
from bs4 import *

url0 = "https://zh.wikipedia.org/wiki/Wikipedia:分類索引"
# urlopen only accepts ASCII, so percent-encode the non-ASCII characters.
# Characters listed in the second argument ('/:?=') are left untouched.
link = parse.quote(url0, '/:?=')
html = request.urlopen(link)
bs0 = BeautifulSoup(html, "html.parser")  # explicit parser avoids a warning
title = bs0.find_all("a")
for x in title:
    # Some <a> tags have no href attribute; printing x["href"] blindly
    # would raise KeyError, so test for the attribute first.
    if 'href' in x.attrs:
        print(x["href"])
PyMySQL
这里用的本地MySQL服务器,需要安装
from urllib import request
from urllib import parse
from bs4 import *
import pymysql

url0 = "http://5izdh.cn/wordpress/"
# urlopen only accepts ASCII; percent-encode everything except '/:?='.
link = parse.quote(url0, '/:?=')
html = request.urlopen(link)
bs0 = BeautifulSoup(html, "html.parser")  # explicit parser avoids a warning
title = bs0.find_all("a", {"class": "text-dark"})

# Local MySQL server (db="scraping", port=3306 could also be passed here).
con0 = pymysql.connect(host="127.0.0.1", user="zanllp",
                       password="**********", charset='utf8')
cur0 = con0.cursor()
try:
    cur0.execute("USE scraping")  # see the basic MySQL notes
    for x in title:
        # Parameterized query: the driver escapes the values itself, which
        # prevents SQL injection and the quoting bugs that building the
        # statement with %-formatting would cause (e.g. a '"' in the text).
        cur0.execute("INSERT INTO pages (title,content) VALUES(%s,%s)",
                     (x.get_text(), x["href"]))
    # Autocommit is off by default; commit explicitly to persist the inserts.
    con0.commit()
finally:
    # Always release the cursor and connection, even if an insert fails.
    cur0.close()
    con0.close()
也可以这样
from urllib import request
from urllib import parse
from bs4 import *
import pymysql

url0 = "http://5izdh.cn/wordpress/"
# urlopen only accepts ASCII; characters in '/:?=' are left unencoded.
link = parse.quote(url0, '/:?=')
html = request.urlopen(link)
bs0 = BeautifulSoup(html, "html.parser")  # explicit parser avoids a warning
title = bs0.find_all("a", {"class": "text-dark"})
content = bs0.find_all("p")

con0 = pymysql.connect(host="127.0.0.1", user="zanllp",
                       password="********", charset='utf8')  # db="scraping", port=3306
cur0 = con0.cursor()
try:
    cur0.execute("USE scraping")
    # zip() pairs each title with a paragraph and stops at the shorter
    # sequence, avoiding the IndexError the manual counter risked when
    # len(content) < len(title).
    for t, c in zip(title, content):
        # Parameterized query: driver-side escaping instead of %-formatting,
        # which would break on quotes in the scraped text (SQL injection).
        cur0.execute("INSERT INTO pages (title,content) VALUES(%s,%s)",
                     (t.get_text(), c.get_text()))
    con0.commit()  # autocommit is off by default; commit to persist the rows
finally:
    cur0.close()
    con0.close()
妈的这两天老是分不清select ,delete,老是写成delect,每写一次就丢给我个You have an error in your SQL syntax;
精确的选择标签,先确定一个大的范围,再在这个范围里找
from urllib import request
from urllib import parse
from bs4 import *
import pymysql


def FirstURL(url):
    """Print the text of every table cell in rows carrying class "hideline".

    Scoped selection: first pick the enclosing rows, then search for cells
    only inside each row — find/find_all called on a Tag is restricted to
    that Tag's subtree.
    """
    furl_html = request.urlopen(url)
    # Explicit parser avoids a GuessedAtParserWarning.
    furl_bs = BeautifulSoup(furl_html, "html.parser")
    # furl_bs.find_all("span", {"class": {"icon-rise", "icon-fair", "icon-fall"}})
    # would select the trend icons instead.
    for row in furl_bs.find_all("tr", {"class": "hideline"}):
        for cell in row.find_all("td"):
            print(cell.get_text())
            # cell.a.get_text() would print the first descendant <a>'s text


FirstURL("http://top.baidu.com/buzz?b=1&fr=tph_right")