From d368911d340a786d909d31d4ba241e5b4d85c2ae Mon Sep 17 00:00:00 2001
From: Erik Wrenholt
Date: Sat, 8 Apr 2023 23:26:14 +0000
Subject: [PATCH 1/2] use an instruct model

---
 .devcontainer/devcontainer.json | 140 ++++++++++++++++----------
 docker-compose.yml              |   2 +-
 server/app.py                   |   3 +-
 server/entrypoint.sh            |   2 +-
 4 files changed, 74 insertions(+), 73 deletions(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index e2b1bef..38f8dea 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,73 +1,73 @@
 // For format details, see https://aka.ms/devcontainer.json. For config options, see the
 // README at: https://github.com/devcontainers/templates/tree/main/src/miniconda
 {
-  "name": "Miniconda (Python 3)",
-  "image": "mcr.microsoft.com/devcontainers/miniconda:0-3",
-  "features": {
-    "ghcr.io/devcontainers/features/common-utils:2": {
-      "installZsh": true,
-      "configureZshAsDefaultShell": true,
-      "installOhMyZsh": true,
-      "upgradePackages": true,
-      "username": "automatic",
-      "userUid": "automatic",
-      "userGid": "automatic"
-    },
-    "ghcr.io/devcontainers/features/node:1": {
-      "nodeGypDependencies": true,
-      "version": "18"
-    },
-    "ghcr.io/devcontainers/features/docker-in-docker:2": {
-      "version": "latest",
-      "enableNonRootDocker": "true",
-      "moby": "true"
-    },
-    "ghcr.io/stuartleeks/dev-container-features/shell-history:0": {},
-  },
-  "mounts": [
-    // node modules cache
-    "source=${localWorkspaceFolderBasename}-node_modules,target=${containerWorkspaceFolder}/node_modules,type=volume",
-    // models cache
-    {
-      "source": "models",
-      "target": "/models",
-      "type": "volume"
-    },
-    // pip cache
-    {
-      "source": "${localWorkspaceFolderBasename}-pip-cache",
-      "target": "/home/vscode/.cache/pip",
-      "type": "volume"
-    },
-  ],
-  "remoteEnv": {
-    "MODELS_DIR": "/models",
-    "MODEL_FILE": "RWKV-4-Pile-1B5-20220929-ctx4096"
-  },
-  // If you have a gpu this will pass it in!
-  // "runArgs": [
-  //   "--gpus",
-  //   "all"
-  // ],
-  // Use 'forwardPorts' to make a list of ports inside the container available locally.
-  "forwardPorts": [
-    3000,
-    8080
-  ],
-  "postCreateCommand": "npm i -g diff-so-fancy && conda init zsh && sudo chown -R vscode node_modules ~/.cache",
-  "postStartCommand": "npm install",
-  "customizations": {
-    "vscode": {
-      "extensions": [
-        "dbaeumer.vscode-eslint",
-        "GitHub.copilot",
-        "ms-python.pylint",
-        "ms-python.python",
-        "streetsidesoftware.code-spell-checker",
-      ],
-      "settings": {}
-    },
-  },
-  // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
-  // "remoteUser": "root"
-}
+    "name": "Miniconda (Python 3)",
+    "image": "mcr.microsoft.com/devcontainers/miniconda:0-3",
+    "features": {
+        "ghcr.io/devcontainers/features/common-utils:2": {
+            "installZsh": true,
+            "configureZshAsDefaultShell": true,
+            "installOhMyZsh": true,
+            "upgradePackages": true,
+            "username": "automatic",
+            "userUid": "automatic",
+            "userGid": "automatic"
+        },
+        "ghcr.io/devcontainers/features/node:1": {
+            "nodeGypDependencies": true,
+            "version": "18"
+        },
+        "ghcr.io/devcontainers/features/docker-in-docker:2": {
+            "version": "latest",
+            "enableNonRootDocker": "true",
+            "moby": "true"
+        },
+        "ghcr.io/stuartleeks/dev-container-features/shell-history:0": {},
+    },
+    "mounts": [
+        // node modules cache
+        "source=${localWorkspaceFolderBasename}-node_modules,target=${containerWorkspaceFolder}/node_modules,type=volume",
+        // models cache
+        {
+            "source": "models",
+            "target": "/models",
+            "type": "volume"
+        },
+        // pip cache
+        {
+            "source": "${localWorkspaceFolderBasename}-pip-cache",
+            "target": "/home/vscode/.cache/pip",
+            "type": "volume"
+        },
+    ],
+    "remoteEnv": {
+        "MODELS_DIR": "/workspaces/rwkv-web-inference/models",
+        "MODEL_FILE": "RWKV-4-Raven-1B5-v8-Eng-20230408-ctx4096"
+    },
+    // If you have a gpu this will pass it in!
+    // "runArgs": [
+    //     "--gpus",
+    //     "all"
+    // ],
+    // Use 'forwardPorts' to make a list of ports inside the container available locally.
+    "forwardPorts": [
+        3000,
+        8080
+    ],
+    "postCreateCommand": "npm i -g diff-so-fancy && conda init zsh && sudo chown -R vscode node_modules ~/.cache",
+    "postStartCommand": "npm install",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "dbaeumer.vscode-eslint",
+                "GitHub.copilot",
+                "ms-python.pylint",
+                "ms-python.python",
+                "streetsidesoftware.code-spell-checker",
+            ],
+            "settings": {}
+        },
+    },
+    // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+    // "remoteUser": "root"
+}
\ No newline at end of file
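Note on the remoteEnv change above: MODELS_DIR now points at the workspace checkout (/workspaces/rwkv-web-inference/models) while docker-compose.yml below keeps /models, and MODEL_FILE switches from the base Pile checkpoint to the Raven instruct checkpoint. A minimal sketch of how the two variables presumably combine into a checkpoint path; this is an assumption, since the diff only shows entrypoint.sh testing for ${MODEL_FILE}.pth, not how app.py builds the path:

    import os

    # Defaults here are the docker-compose values; the devcontainer overrides them.
    models_dir = os.environ.get("MODELS_DIR", "/models")
    model_file = os.environ.get("MODEL_FILE", "RWKV-4-Raven-1B5-v8-Eng-20230408-ctx4096")

    # entrypoint.sh checks for exactly this file before downloading.
    model_path = os.path.join(models_dir, model_file + ".pth")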
diff --git a/docker-compose.yml b/docker-compose.yml
index 4603004..104115f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,7 +6,7 @@ services:
 #    restart: always
     environment:
       - MODELS_DIR=/models
-      - MODEL_FILE=RWKV-4-Pile-1B5-20220929-ctx4096
+      - MODEL_FILE=RWKV-4-Raven-1B5-v8-Eng-20230408-ctx4096
     volumes:
       - models:/models
     ports:
diff --git a/server/app.py b/server/app.py
index 6c6ff8a..42813a5 100644
--- a/server/app.py
+++ b/server/app.py
@@ -40,7 +40,8 @@
 ########################################################################################################
 
 # args.RUN_DEVICE = "cuda" # 'cpu' (already very fast) // 'cuda'
-# args.FLOAT_MODE = "bf16" # fp32 (good for cpu) // fp16 (might overflow) // bf16 (less accurate)
+# fp32 (good for cpu) // fp16 (might overflow) // bf16 (less accurate)
+# args.FLOAT_MODE = "fp16"
 
 args.RUN_DEVICE = "cpu" # 'cpu' (already very fast) // 'cuda'
 # fp32 (good for cpu) // fp16 (might overflow) // bf16 (less accurate)
diff --git a/server/entrypoint.sh b/server/entrypoint.sh
index 70ed285..59d740a 100755
--- a/server/entrypoint.sh
+++ b/server/entrypoint.sh
@@ -11,7 +11,7 @@ fi
 pushd "$MODELS_DIR"
 
 if [ ! -f "${MODEL_FILE}.pth" ]; then
-    wget -q https://huggingface.co/BlinkDL/rwkv-4-pile-1b5/resolve/main/RWKV-4-Pile-1B5-20220929-ctx4096.pth
+    wget -q https://huggingface.co/BlinkDL/rwkv-4-raven/resolve/main/RWKV-4-Raven-1B5-v8-Eng-20230408-ctx4096.pth
 fi
 
 popd
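The entrypoint change above completes the model swap: the Raven checkpoint is fetched from the rwkv-4-raven repo only when ${MODEL_FILE}.pth is absent from the models volume. The same download-if-missing guard, sketched in Python for clarity (illustrative only; entrypoint.sh remains the actual mechanism, and the URL is the one from the patch):

    import os
    import urllib.request

    URL = ("https://huggingface.co/BlinkDL/rwkv-4-raven/resolve/main/"
           "RWKV-4-Raven-1B5-v8-Eng-20230408-ctx4096.pth")
    dest = os.path.join(os.environ["MODELS_DIR"], os.environ["MODEL_FILE"] + ".pth")

    # Skip the multi-gigabyte download when the volume already has the checkpoint.
    if not os.path.exists(dest):
        urllib.request.urlretrieve(URL, dest)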
From 200c66a0bc24a49b10daff22a22748abad6e1e25 Mon Sep 17 00:00:00 2001
From: Erik Wrenholt
Date: Sun, 9 Apr 2023 14:46:21 +0000
Subject: [PATCH 2/2] got cuda working

---
 .devcontainer/devcontainer.json |  8 ++++----
 scripts/dev.js                  | 32 ++++++++++++++++----------------
 server/app.py                   | 19 +++++++++++++++----
 server/requirements.txt         |  7 +++++--
 4 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 38f8dea..8b684cb 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -45,10 +45,10 @@
         "MODEL_FILE": "RWKV-4-Raven-1B5-v8-Eng-20230408-ctx4096"
     },
     // If you have a gpu this will pass it in!
-    // "runArgs": [
-    //     "--gpus",
-    //     "all"
-    // ],
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
     // Use 'forwardPorts' to make a list of ports inside the container available locally.
     "forwardPorts": [
         3000,
diff --git a/scripts/dev.js b/scripts/dev.js
index f95b9d8..f5c7170 100644
--- a/scripts/dev.js
+++ b/scripts/dev.js
@@ -2,25 +2,25 @@ const concurrently = require('concurrently');
 const path = require('node:path');
 
 const { result } = concurrently(
-  [
+    [
+        {
+            command: 'npm run dev',
+            name: 'chat client',
+            cwd: path.resolve(__dirname, '../packages/chat-client'),
+        },
+        {
+            command: "/opt/conda/bin/python app.py 0 -h 0.0.0.0 -p 8080",
+            name: 'chat server',
+            cwd: path.resolve(__dirname, '../server'),
+        }
+    ],
     {
-      command: 'npm run dev',
-      name: 'chat client',
-      cwd: path.resolve(__dirname, '../packages/chat-client'),
-    },
-    {
-      command: "flask run -h 0.0.0.0 -p 8080",
-      name: 'chat server',
-      cwd: path.resolve(__dirname, '../server'),
+        prefix: 'name',
+        killOthers: ['failure', 'success'],
     }
-  ],
-  {
-    prefix: 'name',
-    killOthers: ['failure', 'success'],
-  }
 );
 
 result.catch((error) => {
-  console.error(error);
-  process.exit(1);
+    console.error(error);
+    process.exit(1);
 })
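Two things to note before the app.py hunk below. First, dev.js now runs app.py directly under the conda Python, passing 0 as argv[1], which app.py feeds into CUDA_VISIBLE_DEVICES; the -h and -p flags appear to be leftovers from the old flask run command, since the new __main__ block hardcodes host and port. Second, app.py now hardcodes RUN_DEVICE = "cuda" with fp16, so a container started without the --gpus all runArgs would fail at model load. A hedged sketch of a runtime fallback, not part of this patch:

    import types

    import torch

    args = types.SimpleNamespace()  # stand-in for the args object app.py configures
    if torch.cuda.is_available():
        args.RUN_DEVICE, args.FLOAT_MODE = "cuda", "fp16"  # the settings this patch enables
    else:
        args.RUN_DEVICE, args.FLOAT_MODE = "cpu", "fp32"   # the settings it comments out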
wrong output
+if args.RUN_DEVICE == "cuda":
+    print("cuda device count: ", torch.cuda.device_count())
+    print("current cuda device: ", torch.cuda.current_device())
+    print(torch.cuda.get_device_name(0))
+
+
 TOKEN_MODE = "pile"
 WORD_NAME = [
     "20B_tokenizer.json",
@@ -240,3 +247,7 @@ def help_response():
     response.headers['Access-Control-Allow-Headers'] = 'Origin,X-Requested-With,Content-Type,Accept,Authorization'
 
     return response
+
+
+if __name__ == '__main__':
+    app.run(host="0.0.0.0", port=8080)
diff --git a/server/requirements.txt b/server/requirements.txt
index dfb708b..c58a4d0 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -1,4 +1,7 @@
-torch==1.13.1
+torch==1.13.1+cu116
 numpy==1.24.1
 flask==2.2.2
-transformers==4.25.1
\ No newline at end of file
+transformers==4.25.1
+
+# package location
+--find-links https://download.pytorch.org/whl/torch_stable.html
\ No newline at end of file
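The requirements change pins the CUDA 11.6 wheel (torch==1.13.1+cu116) and adds the extra PyTorch wheel index via --find-links so pip can resolve it. A quick smoke test for the resulting environment, assuming the container was started with the --gpus all runArgs enabled earlier in this series:

    import torch

    print(torch.__version__)          # expect "1.13.1+cu116" with the pinned wheel
    print(torch.cuda.is_available())  # True only if the GPU was passed through
    if torch.cuda.is_available():
        # Same call the new startup logging in app.py uses.
        print(torch.cuda.get_device_name(0))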