Summer ---------0714 web scraping

#0716------------
'''
import csv
csvFile = open("csv test","w+")
try:
    writer = csv.writer(csvFile)
    writer.writerow(('number','number plus 2','number times 3'))
    for i in range(10):
        writer.writerow((i,i+2,i*3))
finally:
    csvFile.close()
'''
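
# The same CSV write as a hedged sketch using a with-block, so the file is closed
# automatically; newline='' avoids the blank rows csv.writer can otherwise produce
# on Windows. The filename here is illustrative.
'''
import csv

with open("csv_test.csv", "w", newline="") as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 3'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 3))
'''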

'''
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://www.runoob.com/html/html-tables.html")
bs = BeautifulSoup(html,"html.parser")

table = bs.findAll('table',{'class':'reference'})[0]
rows = table.findAll('tr')

csvFile = open("csv_test","wt+")
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td','th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
'''

#  use scraping  ---- transactions and SQL statements
'''
import pymysql,random,datetime,re
from urllib.request import urlopen
from bs4 import BeautifulSoup

#create the connection
conn = pymysql.connect(
    host = "127.0.0.1",
    unix_socket = "/tmp/mysql.sock",
    user = "root",
    password = "root",
    db = "gly",
    charset = "utf8"  # pymysql expects "utf8"/"utf8mb4", not "utf-8"
)
cur = conn.cursor()#create the cursor
cur.execute("use scraping")#switch to the scraping database
random.seed(datetime.datetime.now())

#store one row
def store(title,content):
    cur.execute("insert into pages(title,content) values (%s,%s)",(title,content))
    cur.connection.commit()
#crawl the data
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj = BeautifulSoup(html,"html.parser")
    title = bs0bj.find("h1").get_text()
    content = bs0bj.find("div",{"id":"mw-content-text"}).find("p").get_text()
    store(title,content)
    return bs0bj.find("div",{"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
try:
    while len(links) > 0:
        newArticle = links[random.randint(0,len(links)-1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()
'''
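
# The store() function above assumes a pages table already exists in the scraping
# database. A minimal sketch of creating it via pymysql; the connection parameters
# mirror the block above, and the column sizes are assumptions.
'''
import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", password="root", charset="utf8")
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS scraping")
cur.execute("USE scraping")
cur.execute("""
    CREATE TABLE IF NOT EXISTS pages (
        id INT NOT NULL AUTO_INCREMENT,
        title VARCHAR(200),
        content VARCHAR(10000),
        created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        PRIMARY KEY (id)
    )
""")
conn.commit()
cur.close()
conn.close()
'''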

'''
import smtplib
from email.mime.text import MIMEText

def send_mail(username,passwd,recv,content):
    mailserver = "smtp.qq.com"
    username_send = username
    password = passwd
    username_recv = recv
    mail = MIMEText(content,_subtype = 'html',_charset = 'utf-8')

    mail['Subject'] = '工资条'
    mail['From'] = username_send
    mail['To'] = username_recv
    smtp = smtplib.SMTP(mailserver,port=587)
    smtp.starttls()#port 587 requires STARTTLS before login
    smtp.login(username_send,password)
    smtp.sendmail(username_send,username_recv,mail.as_string())
    smtp.quit()
    print("successful")

send_mail("756347780@qq.com","15045491532q","756347780@qq.com","你好")
'''

'''
import smtplib
from email.mime.text import MIMEText
def send_mail(username, passwd, recv, content):
    mailserver = "smtp.163.com"  # 邮箱服务器地址
    username_send = username  # 邮箱用户名
    password = passwd  # 邮箱密码:需要使用授权码
    username_recv = recv  # 收件人,多个收件人用逗号隔开
    mail = MIMEText(content, _subtype='html', _charset='utf-8')

    mail['Subject'] = '工资条'
    mail['From'] = username_send  # sender
    mail['To'] = username_recv  # recipient(s)
    smtp = smtplib.SMTP(mailserver, port=25)  # connect to the mail server; plain SMTP uses port 25
    smtp.login(username_send, password)  # log in
    smtp.sendmail(username_send, username_recv, mail.as_string())  # sender, recipient, and the message rendered as a string
    smtp.quit()  # close the SMTP session when finished
    print('sent successfully')
send_mail("15045491532@163.com","333355wo","15045491532@163.com","你好")
'''
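
# A hedged alternative to the two mail examples above: connect over implicit SSL
# on port 465 with smtplib.SMTP_SSL, which smtp.163.com also supports. The
# account, authorization code, and recipient below are placeholders.
'''
import smtplib
from email.mime.text import MIMEText

def send_mail_ssl(username, passwd, recv, content):
    mail = MIMEText(content, _subtype='html', _charset='utf-8')
    mail['Subject'] = '工资条'
    mail['From'] = username
    mail['To'] = recv
    smtp = smtplib.SMTP_SSL("smtp.163.com", port=465)  # implicit TLS, no starttls() needed
    smtp.login(username, passwd)
    smtp.sendmail(username, recv, mail.as_string())
    smtp.quit()

# send_mail_ssl("sender@163.com", "authorization_code", "recipient@example.com", "hello")
'''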

#read plain text
'''
from urllib.request import urlopen

textPage = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt")
print(str(textPage.read(),'utf-8'))
'''

#read CSV
'''
from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("http://www.pythonscraping.com/files/MontyPythonAlbums.csv")\
    .read().decode("ascii","ignore")
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
for row in csvReader:
    print(row)
'''

#strip the header row (DictReader uses it for field names)
'''
from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("http://www.pythonscraping.com/files/MontyPythonAlbums.csv")\
    .read().decode("ascii","ignore")
dataFile = StringIO(data)
csvReader = csv.DictReader(dataFile)
print(csvReader.fieldnames)
print("---------------")
for row in csvReader:
    print(row)
'''

#read PDF----pdfminer (pdfminer3k)
'''
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager,process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open

def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr,retstr,laparams=laparams)

    process_pdf(rsrcmgr,device,pdfFile)
    device.close()

    content = retstr.getvalue()
    retstr.close()
    return content
pdfFile = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf")
outputString = readPDF(pdfFile)
print(outputString)
pdfFile.close()
'''
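
# If the newer pdfminer.six is installed instead of pdfminer3k, the same page can
# be extracted with its high-level helper; a minimal sketch (BytesIO is used
# because extract_text expects a seekable binary file object).
'''
from urllib.request import urlopen
from io import BytesIO
from pdfminer.high_level import extract_text

pdfData = urlopen("http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf").read()
print(extract_text(BytesIO(pdfData)))
'''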

#read Word .docx
'''
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO

#download the document
wordFile = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
wordFile = BytesIO(wordFile)#wrap the raw bytes in a file-like object
document = ZipFile(wordFile)#a .docx is a zip archive
xml_content = document.read('word/document.xml')#pull out the main XML document
print(xml_content.decode("utf-8"))
'''

#parse the XML with BeautifulSoup
#w:t========== the Word tag that holds text runs
'''
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup

wordFile = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
wordFile = BytesIO(wordFile)
document = ZipFile(wordFile)
xml_content = document.read('word/document.xml')

wordobj = BeautifulSoup(xml_content.decode("utf-8"),'xml')#the 'xml' parser requires lxml
textStrings = wordobj.findAll("w:t")

for textElem in textStrings:
    print(textElem.text)
'''
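
# For plain text the python-docx package is a simpler route than parsing
# word/document.xml by hand; a hedged sketch, assuming python-docx is installed.
'''
from urllib.request import urlopen
from io import BytesIO
from docx import Document

wordFile = urlopen("http://www.pythonscraping.com/pages/AWordDocument.docx").read()
document = Document(BytesIO(wordFile))  # python-docx unpacks the archive itself
for paragraph in document.paragraphs:
    print(paragraph.text)
'''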