#抓取视觉同盟工业设计作品
#插入数据库功能
import requests
import re
import pymysql
from lxml import etree
# MySQL connection settings; unpacked as keyword arguments into
# pymysql.connect() when a scraped article is stored.
config = dict(
    host='1xx.xxx.xx.xx',
    port=3306,  # port
    user='texxxxg',
    password='nxxxxxJ',
    db='txxx3',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)
def cnr(url):
# Fetch one article page, pull out its title, paragraph text and image
# references, and insert the result as one row into the `content` table.
#
# Parameters:
#   url -- address of the article page; passed straight to requests.get().
# Returns:
#   Nothing; the result is the database insert (plus debug print output).
# Raises:
#   IndexError if the title or body pattern finds no match (bt[0] / zw[0]).
#
# NOTE(review): in this copy of the file the function body has lost its
# indentation, and several string literals are split across lines with what
# looks like stripped HTML markup. Restore them from the original before
# running -- TODO confirm against the upstream source.
#
# Example url: "http://www.visionunion.com/article.jsp?code=201903140018"
html=requests.get(url).text
print(html)
# Get the title: first match of the title pattern, minus its last 23 characters.
# NOTE(review): the pattern literal below appears truncated (presumably it
# contained HTML tags around the capture group) -- verify.
btze=r'
(.+?)'bt=re.findall(btze,html,re.S)
bt=bt[0]
bt=bt[:-23]
print(bt)
# Get the article body: everything captured by the body pattern (re.S lets
# '.' span newlines).
# NOTE(review): this pattern literal also appears truncated -- verify.
zwze=r'
(.+?)(责任编辑'zw=re.findall(zwze,html,re.S)
print(zw)
# Extract the text: parse the first body match as HTML with lxml.
wbnr=etree.HTML(zw[0])
#print(wbnr)
# Debug output of the parsed tree.
result = etree.tostring(wbnr)
print(result)
# Text nodes of the <p> elements directly under <body>.
wb=wbnr.xpath('//html/body/p/text()')
print(wb)
# Accumulate one wrapped string per paragraph into sjwb.
# NOTE(review): the wrapper literals inside the loop appear to have lost
# their content (presumably opening/closing tags) -- verify.
sjwb=''
for swb in wb:
swb='
' + swb+'
' + '\n'print(swb)
sjwb=sjwb+swb
# Get the images referenced in the body.
# NOTE(review): the first image pattern is an empty string in this copy;
# presumably the original pattern was lost -- verify. If it finds nothing,
# the fallback pattern src="(.+?)"> is tried instead.
tpze=r''
tp=re.findall(tpze,zw[0],re.S)
sj = ''
if tp==[]:
tpze = r'src="(.+?)">'
tp = re.findall(tpze, zw[0], re.S)
for tpurl in tp:
tpm = tpurl[-10:]
print(tpm)
turl = '
' + 'www.duoxiqi.cn/sjt/' + tpm + '
' + '\n'print(turl)
sj = sj + turl
else:
for tpurl in tp:
tpurl='http://www.visionunion.com'+tpurl
tpm = tpurl[-10:]
print(tpm)
turl='
'+'www.duoxiqi.cn/sjt/'+tpm+'
'+'\n'print(turl)
sj=sj+turl
# Insert into the database: title in `title`, paragraph text followed by the
# image strings in `text`. Values are passed as query parameters.
db = pymysql.connect(**config)
cursor = db.cursor()
sql = "INSERT INTO content(title,text) VALUES(%s,%s)"
cursor.execute(sql,(bt,sjwb+'\n'+sj))
db.commit() # commit the insert
cursor.close()
db.close()
def lj(url):
#url="http://www.visionunion.com/industry_topic.jsp?query_subtopic=cb&$CURRPAGE$=1"
#http://www.visionunion.com/industry_topic.jsp?query_subtopic=cb&$CURRPAGE$=2
html=requests.get(url).text
#print(html)
nrze=r'
(.+?)