#0716------------
'''
import csv

# Write a small arithmetic table to a CSV file.
csvFile = open("csv test", "w+")
try:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 3'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 3))  # i * 3, to match the "times 3" header
finally:
    csvFile.close()
'''
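# The block above closes the file by hand; the same write reads better with a
# `with` statement, which closes the file even if writerow raises. A sketch
# (the filename is my own choice):
'''
import csv

# newline='' prevents blank lines between rows on Windows
with open("csv_test_with.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(('number', 'number plus 2', 'number times 3'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 3))
'''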
'''
# Scrape the first table with class "reference" and save its rows to CSV.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://www.runoob.com/html/html-tables.html")
bs = BeautifulSoup(html, "html.parser")
table = bs.findAll('table', {'class': 'reference'})[0]
rows = table.findAll('tr')

csvFile = open("csv_test", "wt+")
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
'''
# use scraping ---- creating transactions and SQL statements
'''
import pymysql, random, datetime, re
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Create the connection
conn = pymysql.connect(
    host="127.0.0.1",
    unix_socket="/tmp/mysql.sock",
    user="root",
    password="root",
    db="gly",
    charset="utf8"  # pymysql expects "utf8", not "utf-8"
)
cur = conn.cursor()  # create a cursor
cur.execute("USE scraping")  # switch from the default db to scraping
random.seed(datetime.datetime.now())

# Store a page
def store(title, content):
    # parameterized query: let the driver do the quoting and escaping
    cur.execute("INSERT INTO pages (title, content) VALUES (%s, %s)", (title, content))
    cur.connection.commit()

# Scrape the data
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    title = bs0bj.find("h1").get_text()
    content = bs0bj.find("div", {"id": "mw-content-text"}).find("p").get_text()
    store(title, content)
    return bs0bj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

links = getLinks("/wiki/Kevin_Bacon")
try:
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()
'''
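# The INSERT above assumes a `pages` table already exists. A minimal one-time
# setup sketch (the column types are my assumption; the source never shows the
# schema):
'''
import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", password="root", charset="utf8")
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS scraping")
cur.execute("USE scraping")
cur.execute("""
    CREATE TABLE IF NOT EXISTS pages (
        id INT NOT NULL AUTO_INCREMENT,
        title VARCHAR(200),
        content VARCHAR(10000),
        created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        PRIMARY KEY (id)
    )
""")
conn.commit()
cur.close()
conn.close()
'''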
'''
import smtplib
from email.mime.text import MIMEText

def send_mail(username, passwd, recv, content):
    mailserver = "smtp.qq.com"
    username_send = username
    password = passwd
    username_recv = recv
    mail = MIMEText(content, _subtype='html', _charset='utf-8')
    mail['Subject'] = '工资条'  # "payslip"
    mail['From'] = username_send
    mail['To'] = username_recv
    smtp = smtplib.SMTP(mailserver, port=587)
    smtp.starttls()  # port 587 expects an upgrade to TLS before login
    smtp.login(username_send, password)
    smtp.sendmail(username_send, username_recv, mail.as_string())
    smtp.quit()
    print("successful")

send_mail("756347780@qq.com", "15045491532q", "756347780@qq.com", "你好")
'''
'''
import smtplib
from email.mime.text import MIMEText

def send_mail(username, passwd, recv, content):
    mailserver = "smtp.163.com"  # mail server address
    username_send = username     # mailbox user name
    password = passwd            # mailbox password: use the SMTP authorization code, not the login password
    username_recv = recv         # recipients; separate multiple recipients with commas
    mail = MIMEText(content, _subtype='html', _charset='utf-8')
    mail['Subject'] = '工资条'    # "payslip"
    mail['From'] = username_send  # sender
    mail['To'] = username_recv    # recipient; these three header fields are fixed boilerplate
    smtp = smtplib.SMTP(mailserver, port=25)  # connect to the mail server; plain SMTP uses port 25
    smtp.login(username_send, password)       # log in to the mailbox
    smtp.sendmail(username_send, username_recv, mail.as_string())  # sender, recipient, message serialized to a string
    smtp.quit()  # close the SMTP session after sending
    print('sent successfully')

send_mail("15045491532@163.com", "333355wo", "15045491532@163.com", "你好")
'''
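# If the provider blocks plain port 25, the same send works over implicit SSL
# on port 465. A sketch; the addresses and authorization code are placeholders:
'''
import smtplib
from email.mime.text import MIMEText

mail = MIMEText("你好", _subtype='html', _charset='utf-8')
mail['Subject'] = '工资条'
mail['From'] = "sender@example.com"
mail['To'] = "recipient@example.com"

smtp = smtplib.SMTP_SSL("smtp.163.com", port=465)  # TLS from the first byte
smtp.login("sender@example.com", "authorization-code")
smtp.sendmail("sender@example.com", "recipient@example.com", mail.as_string())
smtp.quit()
'''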
# Read plain text
'''
from urllib.request import urlopen
textPage = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt")
print(str(textPage.read(),'utf-8'))
'''
# Read a remote CSV
'''
from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("http://www.pythonscraping.com/files/MontyPythonAlbums.csv")\
    .read().decode("ascii", "ignore")
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
for row in csvReader:
    print(row)
'''
# Handle the header row with DictReader
'''
from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("http://www.pythonscraping.com/files/MontyPythonAlbums.csv")\
    .read().decode("ascii", "ignore")
dataFile = StringIO(data)
csvReader = csv.DictReader(dataFile)  # the first row becomes the field names
print(csvReader.fieldnames)
print("---------------")
for row in csvReader:
    print(row)
'''
# Read PDF ---- pdfminer (pdfminer3k)
'''
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO

def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    process_pdf(rsrcmgr, device, pdfFile)
    device.close()
    content = retstr.getvalue()
    retstr.close()
    return content

pdfFile = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf")
outputString = readPDF(pdfFile)
print(outputString)
pdfFile.close()
'''
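# pdfminer3k targets old Python versions; the maintained fork pdfminer.six
# wraps the same pipeline in a single call. A sketch, assuming pdfminer.six is
# installed:
'''
from io import BytesIO
from urllib.request import urlopen
from pdfminer.high_level import extract_text

# extract_text accepts a path or a binary file object
raw = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf").read()
print(extract_text(BytesIO(raw)))
'''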
# Read Word .docx files
'''
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO

# Fetch the document
wordFile = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
wordFile = BytesIO(wordFile)  # wrap the raw bytes as a file object
document = ZipFile(wordFile)  # a .docx is a zip archive
xml_content = document.read('word/document.xml')  # the body is XML
print(xml_content.decode("utf-8"))
'''
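# Instead of unzipping by hand, the third-party python-docx package parses the
# archive and exposes paragraph text directly. A sketch, assuming
# `pip install python-docx`:
'''
from io import BytesIO
from urllib.request import urlopen
from docx import Document

raw = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
doc = Document(BytesIO(raw))
for para in doc.paragraphs:
    print(para.text)
'''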
# Parse the XML with BeautifulSoup
# w:t ========== the Word text tag
'''
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup

wordFile = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
wordFile = BytesIO(wordFile)
document = ZipFile(wordFile)
xml_content = document.read('word/document.xml')
print(xml_content)

wordobj = BeautifulSoup(xml_content.decode("utf-8"), 'xml')  # the 'xml' parser needs lxml installed
textStrings = wordobj.findAll("w:t")
for textElem in textStrings:
    print(textElem.text)
'''
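# The same <w:t> extraction also works with only the standard library's
# ElementTree; the namespace URI below is the standard WordprocessingML one:
'''
import xml.etree.ElementTree as ET
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
raw = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
xml_content = ZipFile(BytesIO(raw)).read('word/document.xml')
for t in ET.fromstring(xml_content).iter(W + "t"):  # every <w:t> text run
    print(t.text)
'''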