from requests import get
from lxml import etree #用xpath爬
import csv # 逗号分隔值
import os ## 判断文件是否存在,并创建文件
from sqlalchemy import create_engine,MetaData,Table #与数据库
from sqlalchemy import String,Integer,Column #
from urllib import parse ## print(parse.unquote('url'))解码url中文
import re #正则表达式
# Module-level MySQL engine (PyMySQL driver) shared by the whole script.
engine = create_engine(
    "mysql+pymysql://root:root@127.0.0.1:3306/test",
    echo=True,        # log every emitted SQL statement
    pool_size=10,     # connections kept open in the pool
    max_overflow=5,   # extra connections allowed beyond pool_size
)
def get_html(url):
    """Fetch one Baidu search-results page and store each hit in MySQL.

    Parameters
    ----------
    url : str
        URL of a Baidu search results page.

    Side effects: prints each result's title and link, and inserts a
    (name, url) row into the ``user8`` table via the module-level ``engine``.
    Prints 'error' on a non-200 response.
    """
    html = get(url)
    if html.status_code == 200:
        soup = etree.HTML(html.text)
        # One container <div> per organic result on the Baidu SERP.
        results = soup.xpath('//div[contains(@class,"result c-container new-pmd")]')
        for node in results:
            title = ''.join(node.xpath('h3/a//text()'))
            print(title)
            hrefs = node.xpath('h3/a//@href')
            if not hrefs:
                # Result card without a link (e.g. special/ad block) —
                # the original code would raise IndexError here; skip it.
                continue
            link = hrefs[0]
            print(link)
            # Parameterized insert: never interpolate scraped text into SQL
            # with str.format — titles containing quotes would break the
            # statement and open the script to SQL injection.
            engine.execute(
                "insert into user8(name,url) values(%s,%s)",
                (title, link),
            )
    else:
        print('error')
if __name__=="__main__":
filename='test.csv'
if not os.path.exists(filename):
os.makedir(filename)
metadata=MetaData()
user8=Table('user8',metadata,
Column("id",Integer,primary_key=True,autoincrement=True),
Column("name",String(128)),
Column("url",String(255)))
metadata.create_all(engine)
url=['http://www.baidu.com/s?wd=chemy怎么读&pn={}'.format(i) for i in range(0,600,10)]
for u in url:
# print(parse.unquote(url))
get_html(u)
# 复习1(2020.9.23)  -- review note left over from the original blog post
# 最新推荐文章于 2024-10-01 05:04:32 发布  -- blog-site boilerplate; commented out so the file parses