Commit c96f5be: fix qwen2_5_vl

1 parent c05d281

7 files changed: +72 -78 lines

models/Qwen2_5_VL/README.md
Lines changed: 1 addition & 1 deletion

````diff
@@ -37,7 +37,7 @@ python3 export_onnx.py --model_path /path/to/Qwen2_5-VL-3B-Instruct --seq_length
 This section describes how to compile the onnx model into a bmodel. You can also skip the compilation step and download the precompiled model directly:
 
 ``` shell
-python3 -m dfss [email protected]:/ext_model_information/LLM/LLM-TPU/qwen2-vl-2b_int4_seq2048_1dev.bmodel
+python3 -m dfss [email protected]:/ext_model_information/LLM/LLM-TPU/qwen2.5-vl-3b_w4bf16_seq8192.bmodel
 ```
 
 #### 1. Download docker and start the container
````

models/Qwen2_5_VL/compile/compile.sh
Lines changed: 0 additions & 2 deletions

```diff
@@ -188,8 +188,6 @@ vision_transformer() {
   model_deploy.py \
     --mlir vit.mlir \
     ${half_quantize_args} \
-    --quant_input \
-    --quant_input_list 3 \
     --quant_output \
     --high_precision \
     --chip bm1684x \
```

models/Qwen2_5_VL/compile/export_onnx.py
Lines changed: 10 additions & 3 deletions

```diff
@@ -428,7 +428,8 @@ def test_net_with_mask(mode, messages):
         padding=True,
         return_tensors="pt",
     )
-
+    del image_inputs, video_inputs
+    torch.cuda.empty_cache()
     input_ids = inputs.input_ids
     if mode == "image":
         pixel_values = inputs.pixel_values
@@ -445,7 +446,8 @@ def test_net_with_mask(mode, messages):
 
     # vit
     vit_embeds = vit_launch(pixel_values, grid_thw)
-
+    del pixel_values
+    torch.cuda.empty_cache()
     # embedding
     input_ids_prefill = torch.zeros(1, SEQ_LENGTH).to(torch.int32).to(device)
     input_ids_prefill[:, :input_ids.shape[-1]] = input_ids
@@ -482,6 +484,8 @@ def test_net_with_mask(mode, messages):
         v[:, input_ids.shape[-1]:, :, :] = 0
         k_cache.append(k)
         v_cache.append(v)
+        del k, v
+        torch.cuda.empty_cache()
     inputs_embeds = inputs_embeds[:, input_ids.shape[-1] - 1:input_ids.shape[-1]].view(1, 1, HIDDEN_SIZE)
     lm = LmHead()
 
@@ -494,6 +498,7 @@ def test_net_with_mask(mode, messages):
         token_len += 1
         input_id = torch.tensor([token]).to(device)
         out = embed(input_id).view(1, 1, HIDDEN_SIZE)
+        del input_id
         valid_position_ids += 1
         position_ids = torch.tensor(3*[[[valid_position_ids]]]).to(device)
 
@@ -506,6 +511,8 @@ def test_net_with_mask(mode, messages):
                 k_cache[i].to(dtype), v_cache[i].to(dtype))
             k_cache[i][:, token_len-1:token_len, :, :] = k[:, :, :, :]
             v_cache[i][:, token_len-1:token_len, :, :] = v[:, :, :, :]
+            del k, v
+            torch.cuda.empty_cache()
         token = greedy(lm(out.to(dtype))).view(1)
         out_ids.append(int(token))
         words = tokenizer.decode(out_ids)
@@ -592,7 +599,7 @@ def test_image(path, resized_height, resized_width):
 print("\033[31mWhen the input is an image, pay attention to resized_height and resized_width to avoid image quality loss caused by resizing\033[0m")
 
 
-# test_image(path = "./../python_demo/image1.jpg", resized_height=280, resized_width=420)
+# test_image(path = "./../python_demo/test.jpg", resized_height=280, resized_width=420)
 # test_video(path = "./sample.mp4")
 
 # convert
```
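Every addition in this file follows the same pattern: drop the last reference to a large tensor with `del`, then call `torch.cuda.empty_cache()` so the caching allocator releases the freed blocks before the next stage allocates. A minimal standalone sketch of the pattern; the function and tensor names below are illustrative, not from export_onnx.py:

```python
import torch

def two_stage(x: torch.Tensor) -> torch.Tensor:
    # Stage 1 produces a large intermediate.
    hidden = x @ x.transpose(-1, -2)
    pooled = hidden.mean(dim=-1)
    # Drop the last reference before stage 2 allocates, then ask the
    # caching allocator to hand the freed blocks back to the driver.
    del hidden
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Stage 2 now allocates into the freed headroom.
    return pooled @ pooled.transpose(-1, -2)

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    x = torch.randn(4, 512, 512, device=device)
    print(two_stage(x).shape)  # torch.Size([4, 4])
```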

models/Qwen2_5_VL/python_demo/README.md
Lines changed: 10 additions & 0 deletions

````diff
@@ -16,4 +16,14 @@ cd build
 cmake ..
 make
 mv chat*.so ../
+```
+
+## Download the model
+```shell
+python3 -m dfss [email protected]:/ext_model_information/LLM/LLM-TPU/qwen2.5-vl-3b_w4bf16_seq8192.bmodel
+```
+
+## Run
+``` shell
+python3 pipeline.py --model_path ../qwen2.5-vl-3b_w4bf16_seq8192_1dev_20250226_104241.bmodel --config_path config/
 ```
````

models/Qwen2_5_VL/python_demo/chat.cpp
Lines changed: 16 additions & 19 deletions

```diff
@@ -40,8 +40,10 @@ class Qwen2VL {
   Qwen2VL() : sgen(std::random_device()()) {};
 
 private:
-  std::vector<int> make_vit_posid(std::vector<int> &grid_thw);
-  std::vector<uint16_t> make_vit_attn_mask(std::vector<int> &grid_thw);
+  void make_vit_posid(std::vector<int> &grid_thw,
+                      std::vector<int> &position_ids);
+  void make_vit_attn_mask(std::vector<int> &grid_thw,
+                          std::vector<float> &attention_mask);
   std::vector<int> make_posid(const std::vector<int> &grid_thw, int vit_offset,
                               int valid_vit_length, int token_length);
 
@@ -186,7 +188,8 @@ void Qwen2VL::head_launch(const bm_net_info_t *net,
   bm_thread_sync(bm_handle);
 }
 
-std::vector<int> Qwen2VL::make_vit_posid(std::vector<int> &grid_thw) {
+void Qwen2VL::make_vit_posid(std::vector<int> &grid_thw,
+                             std::vector<int> &pos_ids) {
   int t = grid_thw[0];
   int h = grid_thw[1];
   int w = grid_thw[2];
@@ -214,18 +217,16 @@ std::vector<int> Qwen2VL::make_vit_posid(std::vector<int> &grid_thw) {
   }
 
   int valid_vit_pixels = h * w;
-  std::vector<int> pos_ids(MAX_PIXELS * 2, 0);
   for (int i = 0; i < t; ++i) {
     for (int j = 0; j < valid_vit_pixels; ++j) {
       pos_ids[i * valid_vit_pixels + 2 * j] = hpos_ids[j];
       pos_ids[i * valid_vit_pixels + 2 * j + 1] = wpos_ids[j];
     }
   }
-
-  return pos_ids;
 }
 
-std::vector<uint16_t> Qwen2VL::make_vit_attn_mask(std::vector<int> &grid_thw) {
+void Qwen2VL::make_vit_attn_mask(std::vector<int> &grid_thw,
+                                 std::vector<float> &attention_mask) {
   // Extract t, h, w from grid_thw
   int t = grid_thw[0];
   int h = grid_thw[1];
@@ -237,9 +238,6 @@ std::vector<uint16_t> Qwen2VL::make_vit_attn_mask(std::vector<int> &grid_thw) {
     cu_seqlens[i] = h * w * i;
   }
 
-  // Initialize attention_mask with -10000
-  std::vector<uint16_t> attention_mask(MAX_PIXELS * MAX_PIXELS, ATTENTION_MASK);
-
   // Update attention_mask based on cu_seqlens
   for (size_t i = 1; i < cu_seqlens.size(); ++i) {
     int start = cu_seqlens[i - 1];
@@ -253,8 +251,6 @@ std::vector<uint16_t> Qwen2VL::make_vit_attn_mask(std::vector<int> &grid_thw) {
       }
     }
   }
-
-  return attention_mask;
 }
 
 void Qwen2VL::vit_launch(std::vector<float> &pixel_values, int vit_offset,
@@ -264,8 +260,11 @@ void Qwen2VL::vit_launch(std::vector<float> &pixel_values, int vit_offset,
   out_mem = dev_buffer;
   // forward vision transformer
   std::vector<float> pixel_values_pad(MAX_PIXELS * VIT_DIMS, 0);
-  auto position_ids = make_vit_posid(grid_thw);
-  auto attention_mask = make_vit_attn_mask(grid_thw);
+  // Initialize attention_mask with -10000
+  std::vector<float> attention_mask(MAX_PIXELS * MAX_PIXELS, -10000.0f);
+  std::vector<int> position_ids(MAX_PIXELS * 2, 0);
+  make_vit_posid(grid_thw, position_ids);
+  make_vit_attn_mask(grid_thw, attention_mask);
   std::copy(pixel_values.begin(), pixel_values.end(), pixel_values_pad.data());
 
   empty_net(bm_handle, net_vit);
@@ -387,20 +386,18 @@ int Qwen2VL::forward_first(std::vector<int> &tokens,
                            std::vector<int> &grid_thw, int vit_offset,
                            int valid_vit_length) {
   std::vector<int> input_ids(SEQLEN, 0);
-  std::vector<uint16_t> attention_mask(SEQLEN * SEQLEN, 0);
+  std::vector<uint16_t> attention_mask(SEQLEN * SEQLEN, ATTENTION_MASK);
   std::copy(tokens.begin(), tokens.end(), input_ids.data());
 
   token_length = tokens.size(); // text input length
 
   auto position_ids =
       make_posid(grid_thw, vit_offset, valid_vit_length, token_length);
   for (int i = 0; i < token_length; i++) {
-    for (int j = 0; j < SEQLEN; j++) {
+    for (int j = 0; j < token_length; j++) {
       if (j <= i) {
         attention_mask[i * SEQLEN + j] = 0;
-      } else {
-        attention_mask[i * SEQLEN + j] = ATTENTION_MASK;
-      }
+      }
     }
   }
```
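The forward_first change is twofold: the attention mask now starts out fully masked (ATTENTION_MASK) instead of all zeros, so rows past token_length no longer attend to everything, and the inner loop only visits j < token_length since everything beyond that is already masked. A small numpy sketch of the equivalent construction; SEQLEN and the mask constant here are stand-ins, not the repo's values:

```python
import numpy as np

SEQLEN = 6      # stand-in; the real value comes from the compiled bmodel
MASK = 0xF0E2   # stand-in for the repo's ATTENTION_MASK constant

def prefill_mask(token_length: int) -> np.ndarray:
    # Start fully masked, then open only the causal lower triangle of
    # the valid region. Versus the old code (init to 0, if/else over
    # all SEQLEN columns), this writes fewer entries and, crucially,
    # leaves rows >= token_length masked instead of fully open.
    mask = np.full((SEQLEN, SEQLEN), MASK, dtype=np.uint16)
    for i in range(token_length):
        for j in range(i + 1):
            mask[i, j] = 0
    return mask

print(prefill_mask(3))
```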

models/Qwen2_5_VL/python_demo/pipeline.py
Lines changed: 24 additions & 44 deletions

```diff
@@ -45,39 +45,23 @@ def text_message(self):
         return messages
 
     def image_message(self, path):
-        if self.resized_height != None and self.resized_width != None:
-            print("\033[31mWhen the input is an image, make sure resized_height and resized_width match the values used in export_onnx.py\033[0m")
-            messages = [{
-                "role":
-                "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": path,
-                        "resized_height": self.resized_height,
-                        "resized_width": self.resized_width,
-                    },
-                    {
-                        "type": "text",
-                        "text": self.input_str
-                    },
-                ],
-            }]
-        else:
-            messages = [{
-                "role":
-                "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": path,
-                    },
-                    {
-                        "type": "text",
-                        "text": self.input_str
-                    },
-                ],
-            }]
+        print("\033[31mWhen the input is an image, make sure resized_height and resized_width match the values used in export_onnx.py\033[0m")
+        messages = [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": path,
+                    "resized_height": self.resized_height,
+                    "resized_width": self.resized_width,
+                },
+                {
+                    "type": "text",
+                    "text": self.input_str
+                },
+            ],
+        }]
         return messages
 
     def video_message(self, path):
@@ -151,18 +135,14 @@ def chat(self):
                 break
 
             media_path = input("\nImage or Video Path: ")
-            if media_path == "":
-                media_type = "text"
-                messages = self.text_message()
+            if not os.path.exists(media_path):
+                print("Can't find image or video: {}".format(media_path))
+                continue
+            media_type = self.get_media_type(media_path)
+            if media_type == "image":
+                messages = self.image_message(media_path)
             else:
-                if not os.path.exists(media_path):
-                    print("Can't find image or video: {}".format(media_path))
-                    continue
-                media_type = self.get_media_type(media_path)
-                if media_type == "image":
-                    messages = self.image_message(media_path)
-                else:
-                    messages = self.video_message(media_path)
+                messages = self.video_message(media_path)
             inputs = self.process(messages)
             print("\nAnswer:")
```
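With this change chat() always expects a media path: the empty-input fallback to text_message() is gone, missing paths are rejected up front, and the message builder is picked by media type. A sketch of that control flow, assuming get_media_type() classifies by file extension (the repo's actual helper may work differently):

```python
import os

IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}  # assumed set

def get_media_type(path):
    # Assumed behavior: classify by extension, anything else is video.
    return "image" if os.path.splitext(path)[1].lower() in IMAGE_EXTS else "video"

def build_messages(media_path):
    # Mirrors the new chat() flow: validate first, then dispatch;
    # there is no text-only branch anymore.
    if not os.path.exists(media_path):
        print("Can't find image or video: {}".format(media_path))
        return None
    if get_media_type(media_path) == "image":
        return {"type": "image", "image": media_path}
    return {"type": "video", "video": media_path}

print(build_messages("missing.jpg"))  # prints the error and returns None
```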

support/include/utils.h
Lines changed: 11 additions & 9 deletions

```diff
@@ -542,7 +542,7 @@ void compare_out_net(
 // Dump to file
 //===------------------------------------------------------------===//
 void dump_tensor_to_file(bm_handle_t &bm_handle, bm_tensor_t &t,
-                         bm_shape_t bm_shape, const std::string &filename,
+                         bm_shape_t bm_shape, cnpy::npz_t &npz_map,
                          bm_data_type_t tensor_type,
                          const std::string &tensor_name) {
   int mem_size = bm_mem_get_device_size(t.device_mem);
@@ -556,7 +556,7 @@ void dump_tensor_to_file(bm_handle_t &bm_handle, bm_tensor_t &t,
     for (size_t i = 0; i < data.size(); i++) {
       data[i] = fp16_ieee_to_fp32_value(buffer[i]);
     }
-    cnpy::npz_save(filename, tensor_name, data.data(), shape, "a");
+    cnpy::npz_add_array(npz_map, tensor_name, data.data(), shape);
   } else if (tensor_type == BM_BFLOAT16) {
     // BF16
     int cnt = mem_size / sizeof(uint16_t);
@@ -566,19 +566,19 @@ void dump_tensor_to_file(bm_handle_t &bm_handle, bm_tensor_t &t,
     for (size_t i = 0; i < data.size(); i++) {
       data[i] = bf16_to_fp32_value(buffer[i]);
     }
-    cnpy::npz_save(filename, tensor_name, data.data(), shape, "a");
+    cnpy::npz_add_array(npz_map, tensor_name, data.data(), shape);
   } else if (tensor_type == BM_INT32) {
     // INT32
     int cnt = mem_size / sizeof(int32_t);
     std::vector<int> data(cnt);
     bm_memcpy_d2s(bm_handle, data.data(), t.device_mem);
-    cnpy::npz_save(filename, tensor_name, data.data(), shape, "a");
+    cnpy::npz_add_array(npz_map, tensor_name, data.data(), shape);
   } else if (tensor_type == BM_FLOAT32) {
     // FLOAT32
     int cnt = mem_size / sizeof(float);
     std::vector<float> data(cnt);
     bm_memcpy_d2s(bm_handle, data.data(), t.device_mem);
-    cnpy::npz_save(filename, tensor_name, data.data(), shape, "a");
+    cnpy::npz_add_array(npz_map, tensor_name, data.data(), shape);
   } else {
     throw std::runtime_error("Not support dtype");
   }
@@ -587,31 +587,33 @@ void dump_tensor_to_file(bm_handle_t &bm_handle, bm_tensor_t &t,
 void dump_net_input_to_file(bm_handle_t &bm_handle, const bm_net_info_t *net,
                             const std::string &filename) {
   std::vector<bm_tensor_t> in_tensors(net->input_num);
-
+  cnpy::npz_t npz_map;
   for (int i = 0; i < net->input_num; i++) {
     bmrt_tensor_with_device(&in_tensors[i], net->stages[0].input_mems[i],
                             net->input_dtypes[i],
                             net->stages[0].input_shapes[i]);
 
     dump_tensor_to_file(bm_handle, in_tensors[i],
-                        net->stages[0].input_shapes[i], filename,
+                        net->stages[0].input_shapes[i], npz_map,
                         net->input_dtypes[i], "input_" + std::to_string(i));
   }
+  cnpy::npz_save_all(filename, npz_map);
 }
 
 void dump_net_output_to_file(bm_handle_t &bm_handle, const bm_net_info_t *net,
                              const std::string &filename) {
   std::vector<bm_tensor_t> out_tensors(net->output_num);
-
+  cnpy::npz_t npz_map;
   for (int i = 0; i < net->output_num; i++) {
     bmrt_tensor_with_device(&out_tensors[i], net->stages[0].output_mems[i],
                             net->output_dtypes[i],
                             net->stages[0].output_shapes[i]);
 
     dump_tensor_to_file(bm_handle, out_tensors[i],
-                        net->stages[0].output_shapes[i], filename,
+                        net->stages[0].output_shapes[i], npz_map,
                         net->output_dtypes[i], "output_" + std::to_string(i));
   }
+  cnpy::npz_save_all(filename, npz_map);
 }
 
 void dump_net_to_file(bm_handle_t &bm_handle, const bm_net_info_t *net,
```
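The dump helpers used to append one array at a time with cnpy::npz_save(filename, ..., "a"), which reopens and rewrites the archive on every call; they now accumulate everything in a cnpy::npz_t map and write once. Note that npz_add_array and npz_save_all appear to be this repo's own extensions to cnpy rather than the upstream API. The same accumulate-then-save pattern, sketched with numpy's native npz support:

```python
import numpy as np

# Collect all tensors in a dict first, then write the archive once;
# appending per tensor (the old "a" mode) rewrites the file each time.
npz_map = {}
for i in range(3):
    npz_map["input_" + str(i)] = np.random.rand(2, 4).astype(np.float32)
np.savez("net_inputs.npz", **npz_map)

loaded = np.load("net_inputs.npz")
print(sorted(loaded.files))  # ['input_0', 'input_1', 'input_2']
```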
