Excel单元格使用xlwings包调用python函数的公式，截取子网页（标题）的试验问题求助CSDN

本文链接：https://blog.csdn.net/qq_27206435/article/details/103915581

Excel单元格使用xlwings包调用python函数的公式，截取子网页（标题）的试验问题求助CSDN

Python 环境：python3.7 的conda上的py3环境

Excel 2010

Excel单元格布置

D114='http://mp.weixin.qq.com/s?__biz=MzU2MTgxNTE1Nw==&mid=2247484671&idx=2&sn=177e74610bd48593e6a45efb81dddc21&chksm=fc724fe5cb05c6f3df76ef025a6d8ee2c63d5385b96b2f5c13931cd3f9d46d1469486d452239&mpshare=1&scene=23&srcid=&sharer_sharetime=1574561625912&sharer_shareid=d333ff324a43c5c77fa866d86816c0fe#rd
'

E114=get_title(D114,"h2","{'class_':'rich_media_title'}")

使用的源码：文章链接索引清单简化.py

import xlwings as xw

from bs4 import BeautifulSoup
import requests
import codecs
import os
import re
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}





@xw.sub  # only required if you want to import it or run it via UDF Server
def main():
    wb = xw.Book.caller()
    wb.sheets[0].range("A1").value = "Hello xlwings!"


@xw.func
def hello(name):
    return "hello {0}".format(name)


@xw.func
def sum_two(a,b):
    return a+b

@xw.func
@xw.arg('item', doc='This is item that is found.')
@xw.arg('attr', doc='This is an attribute in dictionary  {key:value,...}.')
# @xw.arg('key', doc='This is key that is attribute.')
# @xw.arg('value', doc='This is value of the attribute above.')
def get_title(url,item,attr):#(url,item,{key:value,...}s):
    html = requests.get(url,params=headers)#补充params=
    # html = requests.request('GET',url,headers)
    soup = BeautifulSoup(html.text, 'lxml') #原文.content
    x1 =soup.find(name=item,attrs=attr)
	
    url1='https://mp.weixin.qq.com/s?__biz=MzU2MTgxNTE1Nw==&mid=2247484031&idx=2&sn=e916e47c2f6fa55703f0a3e25c1a860e&chksm=fc724965cb05c073ad331b41bb385701895f3e8f33766b5ba9ad35f78215f96003fd6f544d95&mpshare=1&scene=23&srcid=&sharer_sharetime=1574578858152&sharer_shareid=d333ff324a43c5c77fa866d86816c0fe#rd'
    html1 = requests.get(url1,headers)
    markup='''
	
	'''
	    #1=html1
		#2='<h1 style="display:inline">深度揭秘铁路 12306 的架构</h1>'
    markup1=''' <h2 class="rich_media_title" id="activity-name"> Gartner : 2019年十大数据与分析技术趋势 </h2>''' 
    markup2='''
            <div id="img-content" class="rich_media_wrp">
                
                <h2 class="rich_media_title" id="activity-name">                              Gartner : 2019年十大数据与分析技术趋势
                </h2>'''
	#注意class关键词为python保留，外部调用需要避讳，写在程序里的倒是无妨。
    soup1= BeautifulSoup(markup2, 'lxml') #soup1.text测试正常,对markup1
	                                      #soup1.prettify()测试正常  
   
	# return测试，attrs=attr传递正常  attr={'class':'rich_media_title'}
	
    x11=soup1.find(name=item,attrs=attr)  #item='h2'  ;attr={'class':'rich_media_title'} #find(name='h2',attrs={'class':'rich_media_title'})#测试正常
 
    return  x1.text#soup1.text#x11.text#content#x1.text#len(x11)#x11.text


url1='https://mp.weixin.qq.com/s?__biz=MzU2MTgxNTE1Nw==&mid=2247484031&idx=2&sn=e916e47c2f6fa55703f0a3e25c1a860e&chksm=fc724965cb05c073ad331b41bb385701895f3e8f33766b5ba9ad35f78215f96003fd6f544d95&mpshare=1&scene=23&srcid=&sharer_sharetime=1574578858152&sharer_shareid=d333ff324a43c5c77fa866d86816c0fe#rd'
# print('测试截取的网页:',get_title(url1,item,attr))
# get_title(url1,item,attr)

if __name__ == "__main__":
    xw.books.active.set_mock_caller()
    main()
    # get_title(url1,item,attr)
    # print('测试截取的网页:',get_title(url1,item,attr))

#先导入 from 文章链接索引清单 import get_title；再调用get_title(url,item,attr)；
    #return x1.text#.string #len(x1)#.string 都试过
	#重点：获取tag内容有三种方法，string方法不能处理标签中含有标签的内容，获取为None；text和get_text可以获取。
	#tag.string，soup1.string 返回空
	#type(soup1.string)，type(tag.string)   #出错 "Unexpected Python Error: TypeError: Objects of type 'type' can not be converted to a COM VARIANT"
	#soup1.name返回[document]
	#.get_text()[0],返回空
	#1，测试可以正常返回
	# x1这才是完整的元素，.get_text()和.get都不完整,都是截取的

试验结果征求错误解决方法，Excel公式调用的python函数期望能正常返回抓取的微信页面标题

"    return  x
  File ""e:\鏂板叴it銆乨t棰嗗煙璧勬枡\dt\澶ф暟鎹椂浠ｅ皬鏁版嵁鍒嗘瀽-28.5g\澶ф暟鎹功绫?璁哄潧绛夎祫婧愮粍缁嘰寰俊鍏紬鍙疯祫婧?閾炬帴娓呭崟\澶ф暟鎹垎鏋愭寲鎺樺拰python鏈哄櫒瀛︿範 鍏紬鍙锋枃绔犻摼鎺ョ储寮曟竻鍗昞鏂囩珷閾炬帴绱㈠紩娓呭崟\鏂囩珷閾炬帴绱㈠紩娓呭崟.py"", line 3140, in get_title
    ret = func(*args)
  File ""D:\ProgramData\Miniconda2\envs\py3\lib\site-packages\xlwings\udfs.py"", line 314, in call_udf
    res = call_udf(script, fname, args, this_workbook, FromVariant(caller))
  File ""D:\ProgramData\Miniconda2\envs\py3\lib\site-packages\xlwings\server.py"", line 198, in CallUDF
    return func(*args)
  File ""D:\ProgramData\Miniconda2\envs\py3\lib\site-packages\win32com\server\policy.py"", line 586, in _invokeex_
    return S_OK, -1, self._invokeex_(dispid, lcid, wFlags, args, None, None)
  File ""D:\ProgramData\Miniconda2\envs\py3\lib\site-packages\win32com\server\policy.py"", line 283, in _invoke_
    return self._invoke_(dispid, lcid, wFlags, args)
  File ""D:\ProgramData\Miniconda2\envs\py3\lib\site-packages\win32com\server\policy.py"", line 278, in _Invoke_"

命令行执行效果良好

>python -c "import 文章链接索引清单简化; url='https://mp.weixin.qq.com/s?__biz=MzU2MTgxNTE1Nw==&mid=2247484671&idx=2&sn=177e74610bd48593e6a45efb81dddc21&chksm=fc724fe5cb05c6f3df76ef025a6d8ee2c63d5385b96b2f5c13931cd3f9d46d1469486d452239&mpshare=1&scene=23&srcid=&sharer_sharetime=1574561625912&sharer_shareid=d333ff324a43c5c77fa866d86816c0fe#rd';item= 'h2' ;attr={'class':'rich_media_title'};s=文章链接索引清单简化.get_title(url,item,attr);print('s=',s)"

试验结果

s=



北大开源了Python中文分词工具包，准确度远超Jieba


(py3) E:\新兴IT、DT领域资料\DT\大数据时代小数据分析-28.5G\大数据书籍 论坛等资源 组织\微信公众号资源 链接清单\大数据分析挖掘和Python机器学习 公众号文章链接索引清单\文章链接索引清单>

Excel单元格使用xlwings包调用python函数的公式，截取子网页（标题）的试验 问题求助CSDN

Excel单元格使用xlwings包调用python函数的公式，截取子网页（标题）的试验 问题求助CSDN

Excel单元格布置

使用的源码：文章链接索引清单简化.py

试验结果 征求错误解决方法，Excel公式调用的python函数期望能正常返回抓取的微信页面标题

命令行执行 效果良好

试验结果

Excel单元格使用xlwings包调用python函数的公式，截取子网页（标题）的试验问题求助CSDN

Excel单元格使用xlwings包调用python函数的公式，截取子网页（标题）的试验问题求助CSDN

试验结果征求错误解决方法，Excel公式调用的python函数期望能正常返回抓取的微信页面标题

命令行执行效果良好