Information collection (30 records) from http://www.qian$$$lima.com/zbgg/
1. Flexible crawling of pages 1 to 5
2. Results stored in local txt files

import requests
from lxml import etree

class WYSpider:
    def __init__(self, p):
        # page URLs follow the pattern http://www.qian$$$lima.com/zbgg/p1
        self.host = f'http://www.qian$$$lima.com/zbgg/p{p}'
        self.file_name = f"p{p}"
        self.head = {
            'User-Agent': 'use your own User-Agent string'
        }

    def getHtml(self):
        response = requests.get(url=self.host, headers=self.head)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text  # decoded with the encoding set above
        else:
            print('Request failed, retrying')
            return self.getHtml()  # return the retry result so the caller never gets None

    def parseHtml(self, content):
        a = []
        href = []
        et = etree.HTML(content)  # build an lxml tree for XPath queries
        a_s = et.xpath('//*[@id="__layout"]/div/div[2]/div/div[2]/div[1]/div[1]'
                       '/div[2]/div/div/div/div/div/a[2]')
        print("---", a_s)
        for each in a_s:
            a.append(each.text)
            # a.append(each.text.encode('utf-8').decode('unicode'))  # wrong: 'unicode' is not a codec
        print(a)  # garbled output? check the real charset of the response
        href_s = et.xpath('//*[@id="__layout"]/div/div[2]/div/div[2]/div[1]/div[1]'
                          '/div[2]/div/div/div/div/div/a[2]/@href')
        print(href_s)
        for each in href_s:
            href.append(each)
        return zip(a, href)

    def save(self, a_and_href):
        with open(self.file_name + ".txt", mode='w', encoding='utf-8') as file:
            for k, v in a_and_href:
                file.write(k + ' : ' + v + '\n')
        print('END!', self.file_name + ".txt")

for i in range(1, 6):
    app = WYSpider(i)
    app.save(app.parseHtml(app.getHtml()))
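
One possible follow-up on the garbled-output question flagged in parseHtml: the page may not actually be served as utf-8, so forcing response.encoding = 'utf-8' can mangle the titles. A minimal sketch (assuming the same listing URL and a placeholder User-Agent) that lets requests guess the charset from the response body before parsing:

import requests

# Hypothetical check: fetch one listing page and compare the declared charset
# with the one requests detects from the body, then re-decode with the latter.
resp = requests.get('http://www.qian$$$lima.com/zbgg/p1',
                    headers={'User-Agent': 'use your own User-Agent string'},
                    timeout=10)
print(resp.encoding, resp.apparent_encoding)  # declared vs. detected charset
resp.encoding = resp.apparent_encoding        # trust the detected charset
html = resp.text                              # titles should decode cleanly now

If the titles still look wrong after this, note that the XPath starts at //*[@id="__layout"], which is typically the shell of a JavaScript-rendered page, so the list items may simply not be present in the raw HTML that requests downloads.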