# coding: utf-8
# In[3]:
import re

import requests
from lxml import etree
from sqlalchemy import create_engine
from sqlalchemy import Table, Column, Integer, String, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# SQLite-backed engine; echo=True logs every emitted SQL statement.
engine = create_engine('sqlite:///xskh_pachong.sqlite', echo=True)

# Declarative base plus one module-wide session shared by the model below.
Base = declarative_base()
DBSession = sessionmaker(bind=engine)
session = DBSession()
class xskh_pachong(Base):
    """ORM model for one scraped lead: contact name, phone, snippet, URL.

    The source indentation was flattened; the class body is reconstructed
    here with standard 4-space indentation.
    """
    __tablename__ = "xskh_pachong"

    id = Column(Integer, primary_key=True)
    # Company / contact name extracted from the search result.
    aname = Column(String(128))
    # Mobile phone number matched by the crawler's regex.
    telephone = Column(String(126))
    # Result snippet (title + abstract) the data was extracted from.
    info = Column(String(400))
    # Landing-page URL of the search result.
    http = Column(String(128))

    @classmethod
    def save(cls, data):
        """Persist *data* via the module-wide session and return its id."""
        session.add(data)
        session.commit()
        return data.id
# Create the xskh_pachong table if it does not exist yet.
Base.metadata.create_all(engine)
# NOTE(review): the imports below duplicate the ones at the top of the
# file; they are harmless (imports are idempotent) but redundant.
import requests
from lxml import etree
import re
from sqlalchemy import create_engine
def get_aname_and_telephone(url, company_name, telephone):
    """Fetch *url* and append phone numbers / company names found on the page.

    Results are appended **in place**: the list of mobile numbers found is
    appended to *telephone*, and the list of Chinese company names (strings
    ending in 司/厂) is appended to *company_name*.  Returns None.

    An empty URL or a failed request leaves both lists untouched.
    """
    if url == '':
        return
    try:
        response = requests.get(url)
    except requests.RequestException:
        # Network failure: best-effort enrichment, so just give up quietly.
        return
    try:
        # Many Chinese sites are GBK-encoded; fall back to UTF-8.
        content = response.content.decode('gbk', 'strict')
    except UnicodeDecodeError:
        content = response.content.decode('utf-8', 'strict')
    html = etree.HTML(content)

    # Text inside <span> elements often carries the contact number.
    span_text = "".join(html.xpath('//html/body//span//text()'))
    # Chinese mobile numbers: 11 digits starting 13/14/15/17/18.
    # (The original class [3|4|5|7|8] also matched a literal '|'; fixed.)
    pattern_mob = re.compile(r'1[34578]\d{9}')
    telephone.append(pattern_mob.findall(span_text))

    body_text = "".join(html.xpath('//html/body//text()'))
    # 5-23 CJK characters ending with 司 (company) or 厂 (factory).
    pattern_company = re.compile(r'[\u4e00-\u9fa5]{5,23}[司厂]')
    company_name.append(pattern_company.findall(body_text))
import requests
from lxml import etree
import re
from sqlalchemy import create_engine
def get_by_word(wd):
    """Crawl Baidu search results for keyword *wd*, page by page.

    For every organic result, extract a company name and mobile phone
    numbers from the title + abstract; when the snippet has no phone
    number, fall back to fetching the landing page itself via
    get_aname_and_telephone().  Each (name, phone) lead is persisted as
    one xskh_pachong row.  Stops when no 下一页 (next page) link exists.

    Fixes over the original: the `companay_name` typo (a NameError that a
    bare `except:` silently turned into a discarded result), the duplicated
    landing-page fetch, the dead `if/else: pass` branches, and the regex
    character class that also matched a literal '|'.
    """
    # Browser-like headers so Baidu serves the normal HTML result page.
    headers = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    }
    baidu_url = 'https://www.baidu.com'
    # pn is the result offset (pagination); wd is the search keyword.
    url = baidu_url + '/s?wd={}&pn={}'.format(wd, 0)
    response = requests.get(url, timeout=60, headers=headers)

    # Hoisted, corrected patterns (compiled once, reused per result).
    pattern_mob = re.compile(r'1[34578]\d{9}')
    pattern_company = re.compile(r'[\u4e00-\u9fa5]{5,23}[司厂]')

    count = 1
    while True:
        html = etree.HTML(response.content.decode())
        # Each organic result is a direct <div> child of #content_left,
        # addressable by its numeric @id.
        for item_id in html.xpath('//div[@id="content_left"]/div/@id'):
            base = '//div[@id="content_left"]/div[@id="{}"]'.format(item_id)
            title = "".join(html.xpath(base + '//h3//text()'))
            abstract = "".join(html.xpath(base + '//div[@class="c-abstract"]//text()'))
            link = "".join(html.xpath(base + '/h3/a/@href'))
            snippet = title + ' ' + abstract

            telephone = pattern_mob.findall(snippet)
            company_name = pattern_company.findall(snippet)
            # Prefer an extracted company name; fall back to the title.
            aname = company_name[0] if company_name else title

            if not telephone:
                # No phone in the snippet: try the landing page itself.
                # The helper appends a *list* of matches to each argument.
                try:
                    get_aname_and_telephone(link, company_name, telephone)
                    telephone = telephone[-1] if telephone else []
                    last = company_name[-1] if company_name else None
                    if isinstance(last, list) and last:
                        aname = last[0]
                except Exception:
                    telephone = []

            # One row per phone number found for this result.
            for number in telephone:
                dbobj = xskh_pachong()
                dbobj.aname = aname
                dbobj.telephone = number
                dbobj.info = snippet
                dbobj.http = link
                dbobj.save(dbobj)
                count += 1

        # Pagination: stop when there is no 下一页 (next page) link.
        nav_text = "".join(html.xpath('//*[@id="page"]/a[@class="n"]//text()'))
        if nav_text.find('下一页') < 0:
            break
        nav_urls = html.xpath('//*[@id="page"]/a[@class="n"]//@href')
        # When both 上一页 and 下一页 are present, next-page is the second link.
        next_href = nav_urls[1] if nav_text.find('上一页下一页') > 0 else nav_urls[0]
        response = requests.get(baidu_url + next_href, timeout=60, headers=headers)
    print(count)
# Build search keywords as the cross product of location x industry x key
# phrase (one entry per line in each input file) and crawl each combination.
# The files are only read, so open them 'r' (the original 'r+' requested
# needless write access).
with open('地点.txt', mode='r', encoding='utf-8') as faddr:
    addr = faddr.readlines()
with open('行业.txt', mode='r', encoding='utf-8') as fhangye:
    hangye = fhangye.readlines()
with open('关键字.txt', mode='r', encoding='utf-8') as fkey:
    key = fkey.readlines()

for item_addr in addr:
    for item_hangye in hangye:
        for item_key in key:
            # Join the three fragments, turning trailing newlines into spaces.
            wd = (item_addr.replace('\n', ' ')
                  + item_hangye.replace('\n', ' ')
                  + item_key.replace('\n', ' '))
            get_by_word(wd)
# Source article: "python 实现百度关键字自动爬虫" (Python: an automatic
# Baidu keyword crawler) — blog metadata, last published 2022-07-11 17:40:04.