|
44 | 44 | Gemma3TextModel, |
45 | 45 | GemmaTokenizerFast, |
46 | 46 | GenerationConfig, |
| 47 | + RopeParameters, |
47 | 48 | SiglipVisionConfig, |
48 | 49 | ) |
49 | 50 | from transformers.image_utils import PILImageResampling |
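
Context for reviewers (not part of the diff): the added import pulls RopeParameters from the top-level transformers namespace. Judging by how the commit calls it, it is constructed with keyword arguments and reads back like a plain mapping; a minimal sketch under that assumption:

    from transformers import RopeParameters

    # Built with keyword arguments, read back like a dict entry.
    params = RopeParameters(rope_type="default", rope_theta=1_000_000.0)
    assert params["rope_theta"] == 1_000_000.0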
|
142 | 143 | max_position_embeddings=1024, |
143 | 144 | query_pre_attn_scalar=256, |
144 | 145 | sliding_window=512, |
145 | | - rope_parameters=None, |
| 146 | + rope_parameters={ |
| 147 | + "full_attention": RopeParameters(rope_type="default", rope_theta=1_000_000.0), |
| 148 | + "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0), |
| 149 | + }, |
146 | 150 | use_bidirectional_attention=True, |
147 | 151 | ), |
148 | 152 | vision_config=None, |
|
159 | 163 | max_position_embeddings=32768, |
160 | 164 | query_pre_attn_scalar=256, |
161 | 165 | sliding_window=512, |
162 | | - rope_parameters=None, |
| 166 | + rope_parameters={ |
| 167 | + "full_attention": RopeParameters(rope_type="default", rope_theta=1_000_000.0), |
| 168 | + "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0), |
| 169 | + }, |
163 | 170 | ), |
164 | 171 | vision_config=None, |
165 | 172 | ), |
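
Both hunks above replace rope_parameters=None with an explicit per-layer-type mapping: full-attention layers get a 1e6 RoPE base, sliding-window layers keep the classic 1e4 base. A hypothetical illustration of how such a mapping is consumed (Gemma 3 interleaves five local layers per global layer; the helper itself is not from this commit):

    def theta_for_layer(idx: int, rope_parameters: dict) -> float:
        # Every sixth layer is global (full attention) in Gemma 3.
        layer_type = "full_attention" if (idx + 1) % 6 == 0 else "sliding_attention"
        return rope_parameters[layer_type]["rope_theta"]

    # Layers 0-4 -> 10_000.0, layer 5 -> 1_000_000.0, and so on.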
|
173 | 180 | num_key_value_heads=1, |
174 | 181 | head_dim=256, |
175 | 182 | sliding_window=512, |
176 | | - rope_theta=1_000_000, # used for global RoPE only |
177 | | - rope_local_base_freq=10_000, |
| 183 | + rope_parameters={ |
| 184 | + "full_attention": RopeParameters(rope_type="default", rope_theta=1_000_000.0), |
| 185 | + "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0), |
| 186 | + }, |
178 | 187 | attn_logit_softcapping=None, |
179 | 188 | query_pre_attn_scalar=256, |
180 | 189 | max_position_embeddings=32_768, |
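
This hunk shows the migration proper: two scalar kwargs whose roles were only implied by their names become one mapping whose keys name the layer type. A hedged before/after sketch of the equivalence:

    # Before: roles implied by the kwarg names
    #   rope_theta=1_000_000          -> global / full-attention layers
    #   rope_local_base_freq=10_000   -> local / sliding-window layers
    # After: roles explicit in the dict keys
    rope_parameters = {
        "full_attention": RopeParameters(rope_type="default", rope_theta=1_000_000.0),
        "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0),
    }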
|
192 | 201 | num_key_value_heads=4, |
193 | 202 | sliding_window=1024, |
194 | 203 | rope_parameters={ |
195 | | - "full_attention": {"rope_type": "linear", "factor": 8.0}, |
196 | | - "sliding_attention": {"rope_type": "default"}, |
| 204 | + "full_attention": RopeParameters(rope_type="linear", rope_theta=1_000_000.0, factor=8.0), |
| 205 | + "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0), |
197 | 206 | }, |
198 | | - rope_theta=1_000_000, |
199 | | - rope_local_base_freq=10_000, |
200 | 207 | attn_logit_softcapping=None, |
201 | 208 | query_pre_attn_scalar=256, |
202 | 209 | ), |
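
This hunk and the two below also carry rope_type="linear" with factor=8.0 on the full_attention entry. Under standard linear position interpolation that divides every rotary frequency (equivalently, stretches position ids) by 8; a numeric sketch of that reading, with dim=256 picked purely for illustration:

    def inv_freq(theta: float, dim: int, factor: float = 1.0):
        # Standard RoPE inverse frequencies, with linear scaling folded in.
        return [1.0 / (factor * theta ** (2 * i / dim)) for i in range(dim // 2)]

    base = inv_freq(1_000_000.0, 256)         # unscaled full-attention RoPE
    scaled = inv_freq(1_000_000.0, 256, 8.0)  # factor=8.0: each frequency / 8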
|
213 | 220 | num_key_value_heads=8, |
214 | 221 | sliding_window=1024, |
215 | 222 | rope_parameters={ |
216 | | - "full_attention": {"rope_type": "linear", "factor": 8.0}, |
217 | | - "sliding_attention": {"rope_type": "default"}, |
| 223 | + "full_attention": RopeParameters(rope_type="linear", rope_theta=1_000_000.0, factor=8.0), |
| 224 | + "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0), |
218 | 225 | }, |
219 | | - rope_theta=1_000_000, |
220 | | - rope_local_base_freq=10_000, |
221 | 226 | attn_logit_softcapping=None, |
222 | 227 | query_pre_attn_scalar=256, |
223 | 228 | ), |
|
234 | 239 | head_dim=128, |
235 | 240 | sliding_window=1024, |
236 | 241 | rope_parameters={ |
237 | | - "full_attention": {"rope_type": "linear", "factor": 8.0}, |
238 | | - "sliding_attention": {"rope_type": "default"}, |
| 242 | + "full_attention": RopeParameters(rope_type="linear", rope_theta=1_000_000.0, factor=8.0), |
| 243 | + "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0), |
239 | 244 | }, |
240 | | - rope_theta=1_000_000, |
241 | | - rope_local_base_freq=10_000, |
242 | 245 | attn_logit_softcapping=None, |
243 | 246 | query_pre_attn_scalar=(42 * 128 // 32), # hidden_size // num_attention_heads; scores scale by its inverse sqrt
244 | 247 | ), |
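
The last hunk computes query_pre_attn_scalar inline rather than hardcoding 168. Worked out (hidden_size = 42 * 128 = 5376 and num_attention_heads = 32 are inferred from the expression, not stated in the diff):

    query_pre_attn_scalar = 42 * 128 // 32  # 5376 // 32 = 168
    scale = query_pre_attn_scalar ** -0.5   # ~0.0772, applied to attention scores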
|