学习了urllib库
# Crawl the product listing pages and save every product detail URL.
#
# Each listing page is requested with a POST carrying its page number.  The
# href of every <a> directly inside <div class="g_img"> is extracted and
# appended, prefixed with the site domain, to download/url_data.txt
# (one URL per line).
import os
import re
import urllib.parse
import urllib.request

from lxml import etree

domain = "http://www.ssme.sh.gov.cn"
url = "http://www.ssme.sh.gov.cn/public/search!productList.do"

# Request headers copied from a browser session.
# "Accept-Encoding: gzip, deflate" is deliberately NOT sent: urllib.request
# does not decompress response bodies, so advertising compression could hand
# compressed bytes straight to the HTML parser.
# NOTE(review): the Cookie carries hard-coded session ids — presumably stale;
# confirm whether the site needs it at all.
# NOTE(review): the trailing "}" in User-Agent looks like a paste error —
# confirm before removing.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": (
        "userName=%u4E0A%u6D77%u79FB%u52A8; "
        "NTKF_T2D_CLIENTID=guest5854E8D0-B295-3D14-2777-AE30ACA765C8; "
        "ssmevisit=d3ce576d-6285-47c5-9d96-c4b376121d3f; "
        "JSESSIONID=B540D1C65BEC09A8CA504F7056B30B36; "
        "nTalk_CACHE_DATA={uid:fw_1000_ISME9754_guest5854E8D0-B295-3D,tid:1566534429812073}; "
        "ssmevisittemp=cea07490-9887-47a3-beed-c04d9e39c9f1"
    ),
    "Host": "www.ssme.sh.gov.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36}",
}

# Matches from the first "/" of an href to its end; hrefs that contain no
# "/" produce no match and are skipped.  Compiled once, outside the loops.
path_pattern = re.compile(r'/.*')

# The output directory must exist before the file can be opened.
os.makedirs('download', exist_ok=True)

# Append mode keeps the results of earlier runs.  The file is opened once for
# the whole crawl instead of once per extracted link.
with open('download/url_data.txt', 'a', encoding='utf-8') as f:
    # Listing pages 1 through 170 inclusive.
    for num in range(1, 171):
        data = {"pageNo": str(num)}
        postdata = urllib.parse.urlencode(data).encode('utf-8')
        request = urllib.request.Request(
            url, headers=header, method="POST", data=postdata
        )
        # The context manager closes the HTTP response as soon as it is read.
        with urllib.request.urlopen(request) as response:
            html = etree.HTML(response.read())

        for item in html.xpath('//div[@class="g_img"]/a/@href'):
            searchObject = path_pattern.search(str(item))
            if searchObject:
                f.write(domain + searchObject.group() + '\n')
正则中 ^ 和 $ 分别匹配字符串的开头和结尾,也就是说 ^ 后面的模式必须出现在字符串的最开始。比如 ^a 能匹配 'ab' 开头的 a,而在 'bac' 中匹配不到任何内容。
编码不统一的时候,可以先用文本编辑器打开文件,转换编码之后即可消除乱码。编码(encode)是把字符串转换为字节数据的过程,而解码(decode)是把字节数据转换回字符串的过程。
如果想在文件末尾持续追加写入(而不是覆盖原有内容),需要把文件的打开模式设为 'a'(追加模式)。
# Visit every product URL saved in download/url_data.txt and export the
# contact details found on each page to Stu_csv.csv.
#
# One CSV row is written per URL, with a 3-second pause between requests.
import csv
import re
import time
import urllib.parse
import urllib.request

from lxml import etree

# Request headers copied from a browser session.
# "Accept-Encoding: gzip, deflate" is deliberately NOT sent: urllib.request
# does not decompress response bodies, so advertising compression could hand
# compressed bytes straight to the HTML parser.
# NOTE(review): the Cookie carries hard-coded session ids — presumably stale;
# confirm whether the site needs it at all.
# NOTE(review): the trailing "}" in User-Agent looks like a paste error —
# confirm before removing.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
    "Connection": "keep-alive",
    "Cookie": (
        "userName=%u4E0A%u6D77%u79FB%u52A8; "
        "NTKF_T2D_CLIENTID=guest5854E8D0-B295-3D14-2777-AE30ACA765C8; "
        "ssmevisit=d3ce576d-6285-47c5-9d96-c4b376121d3f; "
        "JSESSIONID=B540D1C65BEC09A8CA504F7056B30B36; "
        "nTalk_CACHE_DATA={uid:fw_1000_ISME9754_guest5854E8D0-B295-3D,tid:1566534429812073}; "
        "ssmevisittemp=351ecff8-92d2-422c-be98-5752ae312bcc"
    ),
    "Host": "www.ssme.sh.gov.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36}",
}

# One XPath per CSV column, in output order.  The trailing names are the
# variable names the values were originally stored under.
FIELD_XPATHS = (
    '//div[@class="cs_c_info"]//tr[2]/td[1]/text()',     # uname
    '//div[@class="cs_c_info"]//tr[2]/td[2]/text()',     # ucell
    '//div[@class="cs_c_info"]//tr[3]/td[2]/text()',     # utel
    '//div[@class="cs_c_info"]//tr[4]/td/text()',        # umail
    '//div[@class="cs_shop_bg clear"]/span/text()',      # company
    '//div[@class="cs_shop_info"]//tr[3]/td/text()',     # place
)

with open('download/url_data.txt', 'r', encoding='utf-8') as url_file:
    # newline='' is what the csv module asks for on files it writes to.
    # Append mode keeps rows from earlier runs.  The file is opened once for
    # the whole crawl instead of once per row.
    with open('Stu_csv.csv', 'a', newline='', encoding='utf-8') as out:
        csv_write = csv.writer(out, dialect='excel')

        for line in url_file:
            # Each line still carries its line terminator, which is not part
            # of the URL and must not be sent in the request.
            url = line.strip()
            if not url:
                continue

            request = urllib.request.Request(url, headers=header, method="GET")
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(request) as response:
                html = etree.HTML(response.read())

            # Take the first text node for each field.  A missing field
            # becomes an empty cell so later values stay in their columns.
            row = []
            for field_xpath in FIELD_XPATHS:
                matches = html.xpath(field_xpath)
                row.append(matches[0] if matches else '')

            print(row)
            csv_write.writerow(row)
            # Flush per row: the crawl is slow, so keep what has been
            # collected on disk even if the run is interrupted later.
            out.flush()
            time.sleep(3)
Python 的 re(正则模块)中,search 方法和 match 方法的区别在于:前者会扫描整个字符串并返回第一个匹配的位置,后者仅仅从字符串开头开始匹配,若开头不符合则不予匹配(返回 None)。