12.9 PyTorch version of ChatGLM2-6B deployment
I. Prerequisites
- The operating system has been installed (Ubuntu, CTyunOS, openEuler, Kylin, or a similar distribution).
- The server has external network access (optional; it makes downloading packages easier, otherwise upload them manually).
- Recommended disk space: more than 1 TB.
- Prepare the Docker installation package, Docker image, code, pre-trained weights, and datasets (download links are provided in the steps below).
II. Application deployment
1. Docker deployment
#Download the installation package
wget https://download.docker.com/linux/static/stable/aarch64/docker-18.09.8.tgz --no-check-certificate
#Unzip and install
tar xvpf docker-18.09.8.tgz
cp -p -f docker/* /usr/bin
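As an optional quick check that the static binaries were copied into place, both the client and the daemon should report their versions:
docker --version
dockerd --version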
#Prepare the environment
#For Docker to work properly, disable SELinux and turn off the firewall.
setenforce 0
systemctl stop firewalld
systemctl disable firewalld
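Optionally, confirm that SELinux is no longer enforcing and that the firewall service is stopped:
getenforce
#expected output: Permissive or Disabled
systemctl is-active firewalld
#expected output: inactive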
#Configure the docker.service service
vim /usr/lib/systemd/system/docker.service
#Press i to enter insert mode and paste the following
[Unit]
Description=Docker Application Container Engine
Documentation=http://docs.docker.com
After=network.target docker.socket
[Service]
Type=notify
EnvironmentFile=-/run/flannel/docker
WorkingDirectory=/usr/local/bin
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:4243 -H unix:///var/run/docker.sock --selinux-enabled=false --log-opt max-size=1g
ExecReload=/bin/kill -s HUP $MAINPID
#Having non-zero Limit*s causes performance problems due to accounting overhead
#in the kernel. We recommend using cgroups to do container-local accounting.
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
#Uncomment TasksMax if your systemd version supports it.
#Only systemd 226 and above support this version.
#TasksMax=infinity
TimeoutStartSec=0
#set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
#kill only the docker process, not all processes in the cgroup
KillMode=process
Restart=on-failure
[Install]
WantedBy=multi-user.target
#Start up related services
systemctl daemon-reload
systemctl status docker
systemctl restart docker
systemctl status docker
systemctl enable docker
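As an optional check, confirm that the daemon is running and see which directory it stores its data in (the "Docker Root Dir" field of docker info):
docker info | grep "Docker Root Dir"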
#Note! After startup, make sure the directory Docker stores its data in has enough free space (by default /var/lib/docker). If there is not enough space there, move it to another path as follows.
vi /etc/docker/daemon.json
#Add the following; the path /home/docker is only an example, adjust it to your environment
{
"data-root":"/home/docker"
}
#Save and exit with :wq
sudo systemctl daemon-reload
sudo systemctl restart docker
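After the restart, you can optionally verify that the new data directory has taken effect:
docker info | grep "Docker Root Dir"
#should now show /home/docker (or whatever path you configured)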
2. Resource preparation
cd /home/work
#docker image download
wget https://czy.obs.cn-east-324.fjaicc.com/chatglm2-6b-pytorch/images/chatglm2-6b-pytorch.tar
#Load the image
docker load -i chatglm2-6b-pytorch.tar
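Optionally confirm that the image was loaded; the repository and tag should match the image referenced in the docker run command below:
docker images | grep chatglm2-6b-pytorch
#expected: chatglm2-6b-pytorch   v1.0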
#Code (with dataset and weights) Download
wget https://czy.obs.cn-east-324.fjaicc.com/chatglm2-6b-pytorch/code/ModelZoo-PyTorch.tar.gz
#code decompression
tar -zxvf ModelZoo-PyTorch.tar.gz
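As a sanity check, assuming the archive was extracted under /home/work as above, the fine-tuning scripts used in step 4 should now be present:
ls /home/work/ModelZoo-PyTorch/PyTorch/built-in/foundation/ChatGLM2-6B/ptuning/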
3. Start the container
#Start the container; replace the contents of every <> placeholder with your own values
docker run -itd -u root --ipc=host \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci4 \
--device=/dev/davinci5 \
--device=/dev/davinci6 \
--device=/dev/davinci7 \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /usr/local/sbin/:/usr/local/sbin/ \
-v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
-v /var/log/npu/slog/:/var/log/npu/slog \
-v /var/log/npu/profiling/:/var/log/npu/profiling \
-v /var/log/npu/dump/:/var/log/npu/dump \
-v /var/log/npu/:/usr/slog \
-v <host work path containing the code, weights, and datasets, e.g. /home/work/>:/home/work \
--name <Name of the container> \
chatglm2-6b-pytorch:v1.0 \
/bin/bash
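Optionally confirm that the container is up and that the Ascend NPUs are visible inside it (replace <Name of the container> with the name chosen above):
docker ps | grep <Name of the container>
docker exec -it <Name of the container> npu-smi info
#npu-smi info should list the eight davinci devices mapped above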
4. Enter the container and start fine-tuning
docker exec -it <container ID> bash
cd /home/work/ModelZoo-PyTorch/PyTorch/built-in/foundation/ChatGLM2-6B/ptuning/
#P-Tuning v2 single-card fine-tuning
bash train.sh
#8-card full-parameter fine-tuning
bash ds_train_finetune.sh
#LoRA fine-tuning
bash ds_train_lora.sh
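While a fine-tuning job is running, NPU utilization and HBM usage can be checked from the host or inside the container to confirm the devices are actually being used:
npu-smi info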