# 命令行部署
## 前提
LittleBoy前端服务需要依赖Nginx服务
Nginx安装部署请参考:[负载均衡 Nginx安装](../loadbalancing/installation-loadbalancing.rst)。
首先登录到oushu1,然后切换到root用户
``` sh
ssh oushu1
su - root
```
创建一个`serverhost`文件,包含所有部署LittleBoy Server(基础服务)的机器
``` sh
cat > ${HOME}/serverhost << EOF
oushu1
oushu2
EOF
```
创建一个`lbhost`文件,包含LittleBoy集群中所有的机器(用于安装LittleBoy并启动计算集群Worker)
``` sh
cat > ${HOME}/lbhost << EOF
oushu1
oushu2
oushu3
EOF
```
在oushu1节点配置yum源,安装lava命令行管理工具
``` sh
# 从yum源所在机器获取repo文件
scp oushu@192.168.1.10:/etc/yum.repos.d/oushu.repo /etc/yum.repos.d/oushu.repo
# 追加yum源所在机器信息到/etc/hosts文件
# 安装lava命令行管理工具
yum clean all
yum makecache
yum install lava
```
oushu1节点和集群内其他节点交换公钥,以便ssh免密码登录和分发配置文件。
```sh
lava ssh-exkeys -f ${HOME}/lbhost -p ********
```
分发repo文件到其他机器
```sh
lava scp -f ${HOME}/lbhost /etc/yum.repos.d/oushu.repo =:/etc/yum.repos.d
```
## 安装
安装LittleBoy前端服务,只需在oushu1安装即可
```sh
yum install littleboy-fe
```
安装LittleBoy Server/Worker
```sh
lava ssh -f ${HOME}/lbhost -e "yum install -y littleboy"
```
## 配置
### 配置LittleBoy前端服务
`/usr/local/nginx/conf/premise-frontend.conf`配置文件中添加前端nginx服务
``` nginx
# littleboy
server {
listen 1895;
listen [::]:1895;
server_name _;
gzip_static on;
error_page 404 /404.html;
# redirect server error pages to the static page /50x.html
error_page 500 502 503 504 /50x.html;
location = /50x.html {
root html;
}
# common router
location / {
root /usr/local/oushu/littleboy-fe;
try_files $uri $uri/ /index.html;
index index.html;
add_header Cache-Control no-cache;
add_header Access-Control-Allow-Origin *;
add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS';
add_header Access-Control-Allow-Headers 'DNT,X-Mx-ReqToken,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Authorization';
}
}
```
`/usr/local/nginx/conf/premise-frontend.conf`配置文件中添加转发规则
``` nginx
# 添加websocket转发地址,在nginx配置文件的最外层
upstream websocket-littleboy {
server 127.0.0.1:1885;
}
# 在lava nginx服务中添加转发规则
server {
listen 3000;
server_name localhost;
error_page 404 /404.html;
# ......
# 此处省略其他服务配置
# 以下是需要添加的配置
# littleboy
location ~ ^/api/lava/littleboy(.*) {
proxy_pass http://127.0.0.1:1885/lava/littleboy$1$is_args$args;
proxy_set_header Host $http_host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
location ~ ^/main/littleboy/assets(.*) {
proxy_pass http://127.0.0.1:1895/assets$1$is_args$args;
proxy_set_header Host $http_host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
location ~ ^/ws/lava/littleboy/(.*) {
proxy_pass http://websocket-littleboy/lava/littleboy/$1$is_args$args;
proxy_read_timeout 100s;
proxy_send_timeout 100s;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
}
location ~ ^/lava/littleboy/notebook/(.*) {
proxy_pass http://websocket-littleboy/lava/littleboy/notebook/$1$is_args$args;
proxy_read_timeout 100s;
proxy_send_timeout 100s;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
}
# 以上是添加的配置
# 此处省略其他服务配置
# ......
# redirect server error pages to the static page /50x.html
error_page 500 502 503 504 /50x.html;
location = /50x.html {
root html;
}
# common router
location / {
root /usr/local/oushu/premise-ui-common/lava-fe-core/dist;
try_files $uri $uri/ /index.html;
index index.html;
add_header Cache-Control no-cache;
add_header Access-Control-Allow-Origin *;
add_header Access-Control-Allow-Methods 'GET, POST, OPTIONS';
add_header Access-Control-Allow-Headers 'DNT,X-Mx-ReqToken,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Authorization';
}
}
```
### 配置LittleBoy基础服务
配置数据库连接`/usr/local/oushu/littleboy/conf/postgres.config.xml`
```xml
localhost
4432
oushu
*******
littleboy
disable
```
根据实际部署情况修改配置文件`/usr/local/oushu/littleboy/conf/lbserver-site.xml`
```xml
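<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property>
        <name>basic.master.hosts</name>
        <value>server1,server2,server3</value>
        <description>LittleBoy所有server的地址(hostname)</description>
    </property>
    <property>
        <name>basic.master.lava.protocol</name>
        <value>https</value>
        <description>Lava REST API服务协议类型</description>
    </property>
    <property>
        <name>basic.master.lava.host</name>
        <value>localhost</value>
        <description>Lava REST API地址(hostname)</description>
    </property>
    <property>
        <name>basic.master.lava.port</name>
        <value>443</value>
        <description>Lava REST API端口</description>
    </property>
    <property>
        <name>basic.master.lava.rpc.port</name>
        <value>8081</value>
        <description>Lava RPC端口</description>
    </property>
    <property>
        <name>basic.master.admin</name>
        <value>https://deployserver:1651</value>
        <description>Lava自动部署REST API地址。可选配置,如果没有配置将通过Lava接口查询自动部署REST API地址,如果配置了,将使用配置的地址</description>
    </property>
    <property>
        <name>basic.master.port</name>
        <value>1885</value>
        <description>LittleBoy Server REST API端口</description>
    </property>
    <property>
        <name>gossip.port</name>
        <value>1888</value>
        <description>LittleBoy Server同步端口</description>
    </property>
    <property>
        <name>gossip.seed</name>
        <value>server1:1888,server2:1888,server3:1888</value>
        <description>LittleBoy Server集群同步成员</description>
    </property>
    <property>
        <name>basic.master.storage</name>
        <value>/littleboy</value>
        <description>LittleBoy Server HDFS存储路径,用于中间文件,模型文件的存储</description>
    </property>
    <property>
        <name>basic.master.hdfsuser</name>
        <value>oushu</value>
        <description>LittleBoy Server HDFS 访问用户</description>
    </property>
    <property>
        <name>basic.logDir</name>
        <value>/usr/local/oushu/log/littleboy</value>
        <description>LittleBoy Server日志文件目录</description>
    </property>
    <property>
        <name>basic.logLevel</name>
        <value>info</value>
        <description>LittleBoy Server日志级别</description>
    </property>
    <property>
        <name>basic.storage</name>
        <value>/data1/littleboy</value>
        <description>LittleBoy Server临时文件目录</description>
    </property>
</configuration>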
```
修改HDFS客户端配置文件`/usr/local/oushu/littleboy/conf/hdfs-client.xml`
``` xml
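<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property>
        <name>rpc.client.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>rpc.client.connect.tcpnodelay</name>
        <value>true</value>
    </property>
    <property>
        <name>rpc.client.max.idle</name>
        <value>10000</value>
    </property>
    <property>
        <name>rpc.client.ping.interval</name>
        <value>10000</value>
    </property>
    <property>
        <name>rpc.client.connect.timeout</name>
        <value>600000</value>
    </property>
    <property>
        <name>rpc.client.connect.retry</name>
        <value>10</value>
    </property>
    <property>
        <name>rpc.client.read.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>rpc.client.write.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>rpc.client.socket.linger.timeout</name>
        <value>-1</value>
    </property>
    <property>
        <name>dfs.client.read.shortcircuit</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.default.replica</name>
        <value>3</value>
    </property>
    <property>
        <name>dfs.prefetchsize</name>
        <value>10</value>
    </property>
    <property>
        <name>dfs.client.failover.max.attempts</name>
        <value>15</value>
    </property>
    <property>
        <name>dfs.default.blocksize</name>
        <value>134217728</value>
    </property>
    <property>
        <name>dfs.client.log.severity</name>
        <value>INFO</value>
    </property>
    <property>
        <name>input.connect.timeout</name>
        <value>600000</value>
    </property>
    <property>
        <name>input.read.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>input.write.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>input.localread.default.buffersize</name>
        <value>2097152</value>
    </property>
    <property>
        <name>input.localread.blockinfo.cachesize</name>
        <value>1000</value>
    </property>
    <property>
        <name>input.read.getblockinfo.retry</name>
        <value>3</value>
    </property>
    <property>
        <name>output.replace-datanode-on-failure</name>
        <value>false</value>
    </property>
    <property>
        <name>output.default.chunksize</name>
        <value>512</value>
    </property>
    <property>
        <name>output.default.packetsize</name>
        <value>65536</value>
    </property>
    <property>
        <name>output.default.write.retry</name>
        <value>10</value>
    </property>
    <property>
        <name>output.connect.timeout</name>
        <value>600000</value>
    </property>
    <property>
        <name>output.read.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>output.write.timeout</name>
        <value>3600000</value>
    </property>
    <property>
        <name>output.packetpool.size</name>
        <value>1024</value>
    </property>
    <property>
        <name>output.close.timeout</name>
        <value>900000</value>
    </property>
    <property>
        <name>dfs.domain.socket.path</name>
        <value>/var/lib/hadoop-hdfs/dn_socket</value>
    </property>
    <property>
        <name>dfs.client.use.legacy.blockreader.local</name>
        <value>false</value>
    </property>
    <property>
        <name>dfs.ha.namenodes.oushu</name>
        <value>nn1,nn2</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.oushu.nn1</name>
        <value>namenode1:50070</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.oushu.nn2</name>
        <value>namenode2:50070</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.oushu.nn1</name>
        <value>namenode1:9000</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.oushu.nn2</name>
        <value>namenode2:9000</value>
    </property>
    <property>
        <name>dfs.nameservices</name>
        <value>oushu</value>
    </property>
</configuration>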
```
在`/usr/local/oushu/littleboy/conf/littleboy-env.sh`文件中修改环境变量JAVA_HOME
```sh
#!/usr/bin/env bash
this="${BASH_SOURCE-$0}"
export LITTLEBOY_HOME=$(cd -- "$(dirname -- "$this")/.." && pwd -P)
export DEPENDENCE_HOME=/usr/local/oushu/littleboy-dependence
platform=`uname`
# might need config manually START
export LB_DEVICE_TYPE=CPU # need to restart process for predict, but will come into force immediately in the next train. # Should check multi-device cluster availability.
export JAVA_HOME=${JAVA_HOME}
# config for hdfs client
export LIBHDFS3_CONF=${LITTLEBOY_HOME}/conf/hdfs-client.xml
# might need config manually END
export JARS_DIR=${LITTLEBOY_HOME}/jars
export PYTHON_BIN=${DEPENDENCE_HOME}/conda3/bin/python3
export LIBHDFS_PATH=${DEPENDENCE_HOME}/lib/libhdfs.so
export ARROW_LIBHDFS_DIR=${DEPENDENCE_HOME}/lib
DEVICE_LIB=""
if [[ "$LB_DEVICE_TYPE" == "GPU" ]];then
DEVICE_LIB="/gpu/lib"
export PATH=${DEPENDENCE_HOME}/conda3/envs/gpu/bin:$PATH
else
export PATH=${DEPENDENCE_HOME}/conda3/bin:$PATH
fi
if [[ "$platform" == "Linux" ]];then
export LD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/amd64/server:${DEPENDENCE_HOME}/lib:${LITTLEBOY_HOME}/lib${DEVICE_LIB}:${LD_LIBRARY_PATH+:$LD_LIBRARY_PATH}
else
export DYLD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/amd64/server:${DEPENDENCE_HOME}/lib:${LITTLEBOY_HOME}/lib${DEVICE_LIB}:${LD_LIBRARY_PATH+:$LD_LIBRARY_PATH}
fi
export CUDA_VISIBLE_DEVICES=0
```
### 配置LittleBoy计算集群
根据实际部署情况修改配置文件`/usr/local/oushu/littleboy/conf/lbworker-site.xml`
```xml
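<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property>
        <name>basic.master.hosts</name>
        <value>server1:1885,server2:1885,server3:1885</value>
        <description>LittleBoy Server地址</description>
    </property>
    <property>
        <name>basic.worker.port</name>
        <value>1891</value>
        <description>LittleBoy Worker REST API端口</description>
    </property>
    <property>
        <name>basic.enablepmmlserver</name>
        <value>true</value>
        <description>是否启动pmml模型服务</description>
    </property>
    <property>
        <name>basic.logDir</name>
        <value>/usr/local/oushu/log/littleboy/</value>
        <description>LittleBoy Worker日志文件目录</description>
    </property>
    <property>
        <name>basic.logLevel</name>
        <value>info</value>
        <description>LittleBoy Worker日志级别</description>
    </property>
    <property>
        <name>basic.storage</name>
        <value>/data1/littleboy</value>
        <description>LittleBoy Worker临时文件目录</description>
    </property>
</configuration>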
```
## 启动
### 启动Nginx服务
启动或重新加载Nginx服务
```sh
#启动nginx
nginx
#重新加载
nginx -s reload
```
### 启动LittleBoy基础服务
登录oushu1节点
```
ssh oushu1
su - root
```
执行以下操作以启动LittleBoy基础服务
```sh
lava ssh -f ${HOME}/serverhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy start master"
```
### 启动LittleBoy计算集群
执行以下操作以启动LittleBoy计算集群
```sh
lava ssh -f ${HOME}/lbhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy start worker"
```
## 检查状态
```sh
ps -ef | grep littleboy
```
## 常用命令
```sh
# 停止基础服务
lava ssh -f ${HOME}/serverhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy stop master"
# 停止计算集群
lava ssh -f ${HOME}/lbhost -e "sudo -u oushu /usr/local/oushu/littleboy/sbin/littleboy stop worker"
```
## 注册到Skylab(可选)
在oushu1节点修改lava命令行工具配置中skylab的节点ip
```
vi /usr/local/oushu/lava/conf/server.json
```
编写注册request到一个文件,例如~/lbworker-register.json(下方示例中的`//`注释仅用于说明,标准JSON不支持注释,写入实际文件时建议删除)
```json
{
"data": {
"name": "LB-Worker",
"group_roles": [
{
"role": "littleboy.worker",
"cluster_name": "lbworker",
"group_name": "worker1",
// 安装的机器信息,需要在lava-admin元数据中
"machines": [
{
"id": 1,
"name": "hostname1",
"subnet": "lava",
"data_ip": "127.0.0.1",
"manage_ip": "",
"assist_port": 1622,
"ssh_port": 22
}
]
}
],
"config": {
"lbworker-site.xml": [
{
"key": "basic.worker.port",
"value": "1891"
},
// 以下是LittleBoy计算集群依赖的Spark集群配置信息
{
"key": "spark.master.rest.port",
"value": "2881"
},
{
"key": "SPARK_MASTER_HOSTS",
"value": "master1,master2" // LittleBoy计算集群依赖的Spark集群master hosts
},
{
"key": "SPARK_HISTORY_UI_PORT",
"value": "2884"
},
{
"key": "SPARK_MASTER_PORT",
"value": "2882"
},
{
"key": "SPARK_MASTER_WEBUI_PORT",
"value": "2883"
},
{
"key": "SPARK_WORKER_WEBUI_PORT",
"value": "2885"
}
]
}
}
}
```
上述配置文件中,需要根据实际情况修改machines数组中的机器信息,在平台基础组件lava所安装的机器执行:
```
psql lavaadmin -p 4432 -U oushu -c "select m.id,m.name,s.name as subnet,m.private_ip as data_ip,m.public_ip as manage_ip,m.assist_port,m.ssh_port from machine as m,subnet as s where m.subnet_id=s.id;"
```
获取到所需的机器信息,根据服务角色对应的节点,将机器信息添加到machines数组中。
例如oushu1对应littleboy worker节点,那么oushu1的机器信息需要被添加到littleboy.worker角色对应的machines数组中。
调用lava命令注册集群:
```
lava login -u oushu -p ******** -T {租户id}
lava onprem-register service -s LBWorker -f ~/lbworker-register.json
```
如果返回值为:
```
Add service by self success
```
则表示注册成功,如果有错误信息,请根据错误信息处理。
同时,从页面登录后,在自动部署模块对应服务中可以查看到新添加的集群,列表中会实时监控LittleBoy进程在机器上的状态。