From 987999b14815aba6f7d416dae7e83d6bfa9e160b Mon Sep 17 00:00:00 2001
From: soulteary <soulteary@gmail.com>
Date: Sat, 25 Mar 2023 13:35:00 +0800
Subject: [PATCH 1/5] feat: docker support, and simple webui

---
 docker/Dockerfile |  22 ++++++++
 docker/webui.py   | 128 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 150 insertions(+)
 create mode 100644 docker/Dockerfile
 create mode 100644 docker/webui.py
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 00000000..dbad6074
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,22 @@
+FROM nvcr.io/nvidia/pytorch:23.02-py3
+LABEL org.opencontainers.image.authors="soulteary@gmail.com"
+
+RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
+    pip install huggingface_hub
+WORKDIR /app
+RUN cat > /get-models.py <<EOF
+from huggingface_hub import hf_hub_download
+title = "RWKV-4-Pile-14B-20230313-ctx8192-test1050"
+model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename=f"{title}.pth")
+EOF
+
+RUN python /get-models.py && \
+    rm -rf /get-models.py && \
+    pip install ninja rwkv==0.6.2 gradio
+
+RUN git clone https://huggingface.co/spaces/BlinkDL/ChatRWKV-gradio/ ChatRWKV && \
+    cd ChatRWKV && git checkout 699bb36437cd15ef8da05c4f392f4ab3ea6be65d
+WORKDIR /app/ChatRWKV
+
+COPY docker/webui.py /app/ChatRWKV/app.py
+CMD ["python", "/app/ChatRWKV/app.py"]
\ No newline at end of file
diff --git a/docker/webui.py b/docker/webui.py
new file mode 100644
index 00000000..ef2bc4ac
--- /dev/null
+++ b/docker/webui.py
@@ -0,0 +1,128 @@
+# Modified from https://huggingface.co/spaces/BlinkDL/ChatRWKV-gradio/blob/main/app.py
+import gradio as gr
+import os, gc, torch
+from datetime import datetime
+from huggingface_hub import hf_hub_download
+from pynvml import *
+nvmlInit()
+gpu_h = nvmlDeviceGetHandleByIndex(0)
+ctx_limit = 1024
+title = "RWKV-4-Pile-14B-20230313-ctx8192-test1050"
+desc = f'''Links:
+<a href='https://github.com/BlinkDL/ChatRWKV' target="_blank" style="margin:0 0.5em">ChatRWKV</a>
+<a href='https://github.com/BlinkDL/RWKV-LM' target="_blank" style="margin:0 0.5em">RWKV-LM</a>
+<a href="https://pypi.org/project/rwkv/" target="_blank" style="margin:0 0.5em">RWKV pip package</a>
+'''
+
+os.environ["RWKV_JIT_ON"] = '1'
+os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
+
+from rwkv.model import RWKV
+model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename=f"{title}.pth")
+model = RWKV(model=model_path, strategy='cuda fp16i8 *20 -> cuda fp16')
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+pipeline = PIPELINE(model, "20B_tokenizer.json")
+
+def infer(
+        ctx,
+        token_count=10,
+        temperature=1.0,
+        top_p=0.8,
+        presencePenalty = 0.1,
+        countPenalty = 0.1,
+):
+    """Generator: sample up to `token_count` tokens after `ctx`, yielding the text so far.
+
+    `presencePenalty` / `countPenalty` lower the logits of already generated
+    tokens; only the last `ctx_limit` prompt tokens are fed to the model.
+    """
+    args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
+                     alpha_frequency = countPenalty,
+                     alpha_presence = presencePenalty,
+                     token_ban = [0], # ban the generation of some tokens
+                     token_stop = []) # stop generation whenever you see any token here
+
+    # Normalise the prompt: one leading newline, trailing newline kept only if present.
+    ctx = ctx.strip(' ')
+    ctx = f'\n{ctx.strip()}\n' if ctx.endswith('\n') else f'\n{ctx.strip()}'
+
+    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+    print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
+    all_tokens = []
+    out_last = 0
+    out_str = ''
+    occurrence = {}
+    state = None
+    try:
+        for i in range(int(token_count)):
+            out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
+            for n in args.token_ban:
+                out[n] = -float('inf')
+            for n in occurrence:
+                out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
+            token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
+            if token in args.token_stop:
+                break
+            all_tokens += [token]
+            occurrence[token] = occurrence.get(token, 0) + 1
+            # Hold back pending tokens while their decoding still contains U+FFFD.
+            tmp = pipeline.decode(all_tokens[out_last:])
+            if '\ufffd' not in tmp:
+                out_str += tmp
+                yield out_str.strip()
+                out_last = i + 1
+    finally:  # also runs when the consumer stops iterating early or the model raises
+        gc.collect()
+        torch.cuda.empty_cache()
+    yield out_str.strip()
+
+examples = [
+    ["Expert Questions & Helpful Answers\nAsk Research Experts\nQuestion:\nHow can we eliminate poverty?\n\nFull Answer:\n", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Here's a short cyberpunk sci-fi adventure story. The story's main character is an artificial human created by a company called OpenBot.\n\nThe Story:\n", 150, 1.0, 0.7, 0.2, 0.2],
+    ['''Below is an instruction that describes a task. Write a response that appropriately completes the request.
+### Instruction:
+Generate a list of adjectives that describe a person as brave.
+### Response:
+''', 150, 1.0, 0.2, 0.5, 0.5],
+    ['''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+### Instruction:
+Arrange the given numbers in ascending order.
+### Input:
+2, 4, 0, 8, 3
+### Response:
+''', 150, 1.0, 0.2, 0.5, 0.5],
+    ["Ask Expert\n\nQuestion:\nWhat are some good plans for world peace?\n\nExpert Full Answer:\n", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Q & A\n\nQuestion:\nWhy is the sky blue?\n\nDetailed Expert Answer:\n", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Dear sir,\nI would like to express my boundless apologies for the recent nuclear war.", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Here is a shell script to find all .hpp files in /home/workspace and delete the 3th row string of these files:", 150, 1.0, 0.7, 0.1, 0.1],
+    ["Building a website can be done in 10 simple steps:\n1.", 150, 1.0, 0.7, 0.2, 0.2],
+    ["A Chinese phrase is provided: 百闻不如一见。\nThe masterful Chinese translator flawlessly translates the phrase into English:", 150, 1.0, 0.5, 0.2, 0.2],
+    ["I believe the meaning of life is", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Simply put, the theory of relativity states that", 150, 1.0, 0.5, 0.2, 0.2],
+]
+
+
+iface = gr.Interface(
+    fn=infer,
+    description=f'''{desc} *** <b>Please try examples first (bottom of page)</b> *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}.''',
+    allow_flagging="never",
+    inputs=[
+        gr.Textbox(lines=10, label="Prompt", value="Here's a short cyberpunk sci-fi adventure story. The story's main character is an artificial human created by a company called OpenBot.\n\nThe Story:\n"),  # prompt
+        gr.Slider(10, 200, step=10, value=150),  # token_count
+        gr.Slider(0.2, 2.0, step=0.1, value=1.0),  # temperature
+        gr.Slider(0.0, 1.0, step=0.05, value=0.7),  # top_p
+        gr.Slider(0.0, 1.0, step=0.1, value=0.2),  # presencePenalty
+        gr.Slider(0.0, 1.0, step=0.1, value=0.2),  # countPenalty
+    ],
+    outputs=gr.Textbox(label="Generated Output", lines=28),
+    examples=examples,
+    cache_examples=False,
+).queue()
+
+demo = gr.TabbedInterface(
+    [iface], ["Generative"],
+    title=title,
+)
+
+demo.queue(max_size=10)
+demo.launch(share=False, server_name="0.0.0.0")

From e23e98dcd0eb8ebed71a0d8c7c590bcffb946869 Mon Sep 17 00:00:00 2001
From: soulteary <soulteary@gmail.com>
Date: Sat, 25 Mar 2023 16:17:38 +0800
Subject: [PATCH 2/5] chore: use latest pypi package, allow saving new format
 model

---
 docker/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index dbad6074..b79f8883 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -12,7 +12,8 @@ EOF
 
 RUN python /get-models.py && \
     rm -rf /get-models.py && \
-    pip install ninja rwkv==0.6.2 gradio
+    pip install ninja gradio && \
+    pip install rwkv==0.7.1 --index-url=https://pypi.org/simple
 
 RUN git clone https://huggingface.co/spaces/BlinkDL/ChatRWKV-gradio/ ChatRWKV && \
     cd ChatRWKV && git checkout 699bb36437cd15ef8da05c4f392f4ab3ea6be65d

From ccebdc2c4eb83399b33c957c15dc2749aa9ab569 Mon Sep 17 00:00:00 2001
From: soulteary <soulteary@gmail.com>
Date: Sat, 25 Mar 2023 16:23:47 +0800
Subject: [PATCH 3/5] feat: convert model to improve app speed (wip)

---
 docker/convert.py | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 docker/convert.py

diff --git a/docker/convert.py b/docker/convert.py
new file mode 100644
index 00000000..2e5c88bd
--- /dev/null
+++ b/docker/convert.py
@@ -0,0 +1,6 @@
+from huggingface_hub import hf_hub_download
+title = "RWKV-4-Pile-14B-20230313-ctx8192-test1050"
+model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename=f"{title}.pth")
+
+from rwkv.model import RWKV
+RWKV(model=model_path, strategy='cuda fp16i8 *20 -> cuda fp16', convert_and_save_and_exit = f"./models/{title}.pth")
\ No newline at end of file

From 22e3396ab5cf57db1c42338a03b81fc44724dc7f Mon Sep 17 00:00:00 2001
From: soulteary <soulteary@gmail.com>
Date: Sat, 25 Mar 2023 16:26:54 +0800
Subject: [PATCH 4/5] chore: use pre-converted model to improve app speed

---
 docker/webui.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/docker/webui.py b/docker/webui.py
index ef2bc4ac..f198fff3 100644
--- a/docker/webui.py
+++ b/docker/webui.py
@@ -2,7 +2,6 @@
 import gradio as gr
 import os, gc, torch
 from datetime import datetime
-from huggingface_hub import hf_hub_download
 from pynvml import *
 nvmlInit()
 gpu_h = nvmlDeviceGetHandleByIndex(0)
@@ -18,7 +17,14 @@
 os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
 
 from rwkv.model import RWKV
-model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename=f"{title}.pth")
+
+model_path = f"./models/{title}.pth"
+if os.path.isfile(model_path):
+    print(f"The pre-converted model exists.")
+else:
+    from huggingface_hub import hf_hub_download
+    model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename=f"{title}.pth")
+
 model = RWKV(model=model_path, strategy='cuda fp16i8 *20 -> cuda fp16')
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 pipeline = PIPELINE(model, "20B_tokenizer.json")

From 09fe10892a392bb7242f8bfe72ff6f14ca22646f Mon Sep 17 00:00:00 2001
From: soulteary <soulteary@gmail.com>
Date: Sat, 25 Mar 2023 17:15:21 +0800
Subject: [PATCH 5/5] feat: add minimum resource requirements mode

---
 docker/Dockerfile      |   1 +
 docker/convert.mini.py |   6 ++
 docker/webui.mini.py   | 134 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 docker/convert.mini.py
 create mode 100644 docker/webui.mini.py

diff --git a/docker/Dockerfile b/docker/Dockerfile
index b79f8883..f438a228 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,4 +20,5 @@ RUN git clone https://huggingface.co/spaces/BlinkDL/ChatRWKV-gradio/ ChatRWKV &&
 WORKDIR /app/ChatRWKV
 
 COPY docker/webui.py /app/ChatRWKV/app.py
+COPY docker/*.mini.py /app/ChatRWKV/
 CMD ["python", "/app/ChatRWKV/app.py"]
\ No newline at end of file
diff --git a/docker/convert.mini.py b/docker/convert.mini.py
new file mode 100644
index 00000000..a89b22b4
--- /dev/null
+++ b/docker/convert.mini.py
@@ -0,0 +1,6 @@
+from huggingface_hub import hf_hub_download
+title = "RWKV-4-Pile-14B-20230313-ctx8192-test1050"
+model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename=f"{title}.pth")
+
+from rwkv.model import RWKV
+RWKV(model=model_path, strategy='cuda fp16i8 *0+ -> cpu fp32 *1', convert_and_save_and_exit = f"./models/{title}.pth")
diff --git a/docker/webui.mini.py b/docker/webui.mini.py
new file mode 100644
index 00000000..31396221
--- /dev/null
+++ b/docker/webui.mini.py
@@ -0,0 +1,134 @@
+# Modified from https://huggingface.co/spaces/BlinkDL/ChatRWKV-gradio/blob/main/app.py
+import gradio as gr
+import os, gc, torch
+from datetime import datetime
+from pynvml import *
+nvmlInit()
+gpu_h = nvmlDeviceGetHandleByIndex(0)
+ctx_limit = 1024
+title = "RWKV-4-Pile-14B-20230313-ctx8192-test1050"
+desc = f'''Links:
+<a href='https://github.com/BlinkDL/ChatRWKV' target="_blank" style="margin:0 0.5em">ChatRWKV</a>
+<a href='https://github.com/BlinkDL/RWKV-LM' target="_blank" style="margin:0 0.5em">RWKV-LM</a>
+<a href="https://pypi.org/project/rwkv/" target="_blank" style="margin:0 0.5em">RWKV pip package</a>
+'''
+
+os.environ["RWKV_JIT_ON"] = '1'
+os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
+
+from rwkv.model import RWKV
+
+model_path = f"./models/{title}.pth"
+if os.path.isfile(model_path):
+    print(f"The pre-converted model exists.")
+else:
+    from huggingface_hub import hf_hub_download
+    model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename=f"{title}.pth")
+
+model = RWKV(model=model_path, strategy='cuda fp16i8 *0+ -> cpu fp32 *1')
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+pipeline = PIPELINE(model, "20B_tokenizer.json")
+
+def infer(
+        ctx,
+        token_count=10,
+        temperature=1.0,
+        top_p=0.8,
+        presencePenalty = 0.1,
+        countPenalty = 0.1,
+):
+    """Generator: sample up to `token_count` tokens after `ctx`, yielding the text so far.
+
+    `presencePenalty` / `countPenalty` lower the logits of already generated
+    tokens; only the last `ctx_limit` prompt tokens are fed to the model.
+    """
+    args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
+                     alpha_frequency = countPenalty,
+                     alpha_presence = presencePenalty,
+                     token_ban = [0], # ban the generation of some tokens
+                     token_stop = []) # stop generation whenever you see any token here
+
+    # Normalise the prompt: one leading newline, trailing newline kept only if present.
+    ctx = ctx.strip(' ')
+    ctx = f'\n{ctx.strip()}\n' if ctx.endswith('\n') else f'\n{ctx.strip()}'
+
+    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+    print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
+    all_tokens = []
+    out_last = 0
+    out_str = ''
+    occurrence = {}
+    state = None
+    try:
+        for i in range(int(token_count)):
+            out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
+            for n in args.token_ban:
+                out[n] = -float('inf')
+            for n in occurrence:
+                out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
+            token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
+            if token in args.token_stop:
+                break
+            all_tokens += [token]
+            occurrence[token] = occurrence.get(token, 0) + 1
+            # Hold back pending tokens while their decoding still contains U+FFFD.
+            tmp = pipeline.decode(all_tokens[out_last:])
+            if '\ufffd' not in tmp:
+                out_str += tmp
+                yield out_str.strip()
+                out_last = i + 1
+    finally:  # also runs when the consumer stops iterating early or the model raises
+        gc.collect()
+        torch.cuda.empty_cache()
+    yield out_str.strip()
+
+examples = [
+    ["Expert Questions & Helpful Answers\nAsk Research Experts\nQuestion:\nHow can we eliminate poverty?\n\nFull Answer:\n", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Here's a short cyberpunk sci-fi adventure story. The story's main character is an artificial human created by a company called OpenBot.\n\nThe Story:\n", 150, 1.0, 0.7, 0.2, 0.2],
+    ['''Below is an instruction that describes a task. Write a response that appropriately completes the request.
+### Instruction:
+Generate a list of adjectives that describe a person as brave.
+### Response:
+''', 150, 1.0, 0.2, 0.5, 0.5],
+    ['''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+### Instruction:
+Arrange the given numbers in ascending order.
+### Input:
+2, 4, 0, 8, 3
+### Response:
+''', 150, 1.0, 0.2, 0.5, 0.5],
+    ["Ask Expert\n\nQuestion:\nWhat are some good plans for world peace?\n\nExpert Full Answer:\n", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Q & A\n\nQuestion:\nWhy is the sky blue?\n\nDetailed Expert Answer:\n", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Dear sir,\nI would like to express my boundless apologies for the recent nuclear war.", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Here is a shell script to find all .hpp files in /home/workspace and delete the 3th row string of these files:", 150, 1.0, 0.7, 0.1, 0.1],
+    ["Building a website can be done in 10 simple steps:\n1.", 150, 1.0, 0.7, 0.2, 0.2],
+    ["A Chinese phrase is provided: 百闻不如一见。\nThe masterful Chinese translator flawlessly translates the phrase into English:", 150, 1.0, 0.5, 0.2, 0.2],
+    ["I believe the meaning of life is", 150, 1.0, 0.7, 0.2, 0.2],
+    ["Simply put, the theory of relativity states that", 150, 1.0, 0.5, 0.2, 0.2],
+]
+
+
+iface = gr.Interface(
+    fn=infer,
+    description=f'''{desc} *** <b>Please try examples first (bottom of page)</b> *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}.''',
+    allow_flagging="never",
+    inputs=[
+        gr.Textbox(lines=10, label="Prompt", value="Here's a short cyberpunk sci-fi adventure story. The story's main character is an artificial human created by a company called OpenBot.\n\nThe Story:\n"),  # prompt
+        gr.Slider(10, 200, step=10, value=150),  # token_count
+        gr.Slider(0.2, 2.0, step=0.1, value=1.0),  # temperature
+        gr.Slider(0.0, 1.0, step=0.05, value=0.7),  # top_p
+        gr.Slider(0.0, 1.0, step=0.1, value=0.2),  # presencePenalty
+        gr.Slider(0.0, 1.0, step=0.1, value=0.2),  # countPenalty
+    ],
+    outputs=gr.Textbox(label="Generated Output", lines=28),
+    examples=examples,
+    cache_examples=False,
+).queue()
+
+demo = gr.TabbedInterface(
+    [iface], ["Generative"],
+    title=title,
+)
+
+demo.queue(max_size=10)
+demo.launch(share=False, server_name="0.0.0.0")