1# 安装 PyTorch 版本 2.1.2 2!pip install -q -U "torch==2.1.2" 3# 安装 TensorBoard 4!pip install -q -U tensorboard 5# 安装 transformers 版本 4.36.2 6!pip install -q -U "transformers==4.36.2" 7# 安装 datasets 版本 2.16.1 8!pip install -q -U "datasets==2.16.1" 9# 安装 accelerate 版本 0.26.1 10!pip install -q -U "accelerate==0.26.1" 11# 安装 bitsandbytes 版本 0.42.0 12!pip install -q -U "bitsandbytes==0.42.0" 13# 安装 huggingface/trl 仓库 14!pip install -q -U git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e 15# 安装 huggingface/peft 仓库 16!pip install -q -U git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f
FinancialPhraseBank数据集由阿尔托大学商学院(Aalto University School of Business)的研究者提供,用于金融新闻情感分类。该数据集包含约5000条(4846条)金融新闻标题,由16名具有金融背景的标注者进行分析,判断其对股价的影响是正面(positive)、负面(negative)还是中性(neutral)。每条数据由两部分组成:Sentiment和News Headline,其中Sentiment是标注者根据News Headline的内容给出的情感判断。
# Import pandas under its conventional alias.
import pandas as pd

# Path of the CSV file that holds the labelled headlines.
filename = "../input/sentiment-analysis-for-financial-news/all-data.csv"

# Name the two columns explicitly and decode the file as UTF-8; bytes that
# cannot be decoded are substituted instead of raising an error.
read_options = dict(
    names=["Sentiment", "News Headline"],
    encoding="utf-8",
    encoding_errors="replace",
)
df = pd.read_csv(filename, **read_options)

# Show up to 100 characters per column before pandas truncates the text.
pd.set_option('max_colwidth', 100)

# Preview the first rows of the DataFrame.
df.head()
  | Sentiment | News Headline |
0 | neutral | According to Gran , the company has no plans to move all production to Russia , although that is... |
1 | neutral | Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to... |
2 | negative | The international electronic industry company Elcoteq has laid off tens of employees from its Ta... |
3 | positive | With the new production plant the company would increase its capacity to meet the expected incre... |
4 | positive | According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-te... |
1# 导入所需库 2import os 3os.environ["CUDA_VISIBLE_DEVICES"] = "0" # 设置GPU设备可见性为0 4os.environ["TOKENIZERS_PARALLELISM"] = "false" # 禁用分词器并行化 5import warnings 6warnings.filterwarnings("ignore") # 忽略警告信息 7import numpy as np 8import pandas as pd 9import os 10from tqdm import tqdm 11import bitsandbytes as bnb 12import torch 13import torch.nn as nn 14import transformers 15from datasets import Dataset 16from peft import LoraConfig, PeftConfig 17from trl import SFTTrainer 18from trl import setup_chat_format 19from transformers import (AutoModelForCausalLM, 20 AutoTokenizer, 21 BitsAndBytesConfig, 22 TrainingArguments, 23 pipeline, 24 logging) 25from sklearn.metrics import (accuracy_score, 26 classification_report, 27 confusion_matrix) 28from sklearn.model_selection import train_test_split 29device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # 设置设备为GPU或CPU 30print(f"working on {device}") # 打印当前使用的设备
working on cuda:0
# Import the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load the labelled headlines
filename = "../input/sentiment-analysis-for-financial-news/all-data.csv"
df = pd.read_csv(filename,
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

# Per-class splits are collected here and concatenated afterwards
X_train = list()
X_test = list()

# Draw 300 training rows and 300 test rows for every sentiment class so that
# both sets are class-balanced
for sentiment in ["positive", "neutral", "negative"]:
    train, test = train_test_split(df[df.sentiment == sentiment],
                                   train_size=300,
                                   test_size=300,
                                   random_state=42)
    X_train.append(train)
    X_test.append(test)

# Merge the per-class splits; the training set is additionally shuffled
X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

# Build the evaluation set from rows that are in NEITHER split.
# The exclusion must use the combined index of all three classes: the loop
# variables `train`/`test` only hold the last (negative) class, and using them
# would let positive/neutral train and test rows leak into the eval set.
# This has to run before X_train's index is reset below.
used_idx = X_train.index.union(X_test.index)
X_eval = df[~df.index.isin(used_idx)]
# 50 rows per class, sampled with replacement because a class may have fewer
# than 50 rows left over
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)


def generate_prompt(data_point):
    """Return the training prompt for one row: instruction, headline and label.

    `data_point` must provide the keys "text" and "sentiment".
    """
    # Plain square brackets: the instruction text refers to them, and "\["
    # would be an invalid escape sequence that leaks a backslash into the prompt.
    return f"""
Analyze the sentiment of the news headline enclosed in square brackets,
determine if it is positive, neutral, or negative, and return the answer as
the corresponding sentiment label "positive" or "neutral" or "negative".

[{data_point["text"]}] = {data_point["sentiment"]}
""".strip()


def generate_test_prompt(data_point):
    """Return the inference prompt for one row: same as training, label omitted.

    The prompt ends with "=" so the model's continuation is the label.
    """
    return f"""
Analyze the sentiment of the news headline enclosed in square brackets,
determine if it is positive, neutral, or negative, and return the answer as
the corresponding sentiment label "positive" or "neutral" or "negative".

[{data_point["text"]}] = """.strip()


# Turn every split into a single-column ("text") frame of prompts; the true
# test labels are kept aside before the test frame is overwritten
X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), columns=["text"])
y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

# Convert the training and evaluation frames to Hugging Face Datasets
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)
def evaluate(y_true, y_pred):
    """Print overall accuracy, per-label accuracy, a classification report
    and a confusion matrix for string sentiment labels.

    Labels are first encoded as integers (negative=0, neutral=1, positive=2);
    "none" and any label missing from the table are encoded as 1.
    """
    label_to_id = {'positive': 2, 'neutral': 1, 'none': 1, 'negative': 0}

    def encode(label):
        # Unknown labels fall back to 1
        return label_to_id.get(label, 1)

    encoder = np.vectorize(encode)
    y_true = encoder(y_true)
    y_pred = encoder(y_pred)

    # Overall accuracy
    overall = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {overall:.3f}')

    # Accuracy restricted to the samples of each true label
    for label in set(y_true):
        pairs = [(t, p) for t, p in zip(y_true, y_pred) if t == label]
        truths = [t for t, _ in pairs]
        preds = [p for _, p in pairs]
        per_label = accuracy_score(truths, preds)
        print(f'Accuracy for label {label}: {per_label:.3f}')

    # Classification report
    report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(report)

    # Confusion matrix with a fixed label order
    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(matrix)
首先,我们对未经微调的Llama 2 7B模型进行评估,以了解其在该任务上的基线性能。为了节省显存,我们使用bitsandbytes对模型进行4-bit量化。在实验过程中,我们选择较低的temperature值以降低生成结果的随机性。此外,我们通过识别关键词来判断模型对新闻的情感分类:例如,如果输出中包含“positive”,则认为结果为“positive”。
# Import the required libraries and modules
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from tqdm import tqdm

# Model location and compute dtype
model_name = "/kaggle/input/llama2-7b-hf/Llama2-7b-hf"
compute_dtype = getattr(torch, "float16")

# bitsandbytes quantization settings: 4-bit NF4 weights with double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model onto the selected device
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config,
)

# Disable the KV cache
model.config.use_cache = False
# NOTE(review): pretraining_tp presumably is the tensor-parallel degree used
# during pretraining — confirm against the Llama config documentation
model.config.pretraining_tp = 1

# Load the tokenizer and configure padding
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                          )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Apply trl's chat-format setup to the model and tokenizer
model, tokenizer = setup_chat_format(model, tokenizer)


def predict(test, model, tokenizer):
    """Generate one sentiment label per prompt in the global ``X_test``.

    Args:
        test: Currently unused; kept so existing calls keep working. The
            prompts are read from the module-level ``X_test`` frame
            (column "text").
        model: Causal language model used for generation.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A list of "positive", "negative", "neutral" or "none" (no keyword
        found), in the row order of ``X_test``.
    """
    # Build the generation pipeline once; it does not depend on the prompt,
    # so re-creating it for every sample is wasted work.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=1,
                    temperature=0.0,
                    )
    y_pred = []
    for prompt in tqdm(X_test["text"]):
        result = pipe(prompt)
        # The prompt ends with "=", so whatever follows the last "=" is the answer
        answer = result[0]['generated_text'].split("=")[-1]
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")
    return y_pred


# Run the baseline prediction on the test prompts
y_pred = predict(X_test, model, tokenizer)
100%|██████████| 900/900 [05:28<00:00, 2.74it/s]
# Score the baseline predictions against the true labels
evaluate(y_true, y_pred)
# Output recorded for this run:
# Accuracy: 0.373
# Accuracy for label 0: 0.027
# Accuracy for label 1: 0.937
# Accuracy for label 2: 0.157
#
# Classification Report:
#               precision    recall  f1-score   support
#
#            0       0.89      0.03      0.05       300
#            1       0.34      0.94      0.50       300
#            2       0.67      0.16      0.25       300
#
#     accuracy                           0.37       900
#    macro avg       0.63      0.37      0.27       900
# weighted avg       0.63      0.37      0.27       900
#
# Confusion Matrix:
# [[  8 287   5]
#  [  1 281  18]
#  [  0 253  47]]
- LoRA参数:
- lora_alpha:新增的LoRA参数的缩放系数。
- lora_dropout:新增的LoRA参数的dropout概率。
- r:LoRA矩阵的秩。
- 训练参数:
- learning_rate:学习率。由于LoRA训练的是新增的参数,通常需要设置较大的学习率(本文使用2e-4)。
- gradient_accumulation_steps:梯度累积步数。由于P100显存有限,训练时每次只能处理一条数据(per_device_train_batch_size=1),因此需要设置较大的累积步数(本文为8),以获得更大的等效batch size。
- num_train_epochs:训练的epochs,由于时间限制,本次仅进行了1个epoch的训练。在大模型LoRA或全参微调时,一般建议进行2个epoch的训练,此时valid loss最低;进行3个epoch的训练,模型可能会略微过拟合,但在评测数据上表现最优。
1# 设置训练权重的输出目录 2output_dir = "trained_weights" 3 4# 配置Lora(局部线性自适应)参数 5peft_config = LoraConfig( 6 lora_alpha=16, # Lora的alpha值,用于控制模型的大小 7 lora_dropout=0.1, # Lora的dropout率,用于防止过拟合 8 r=64, # Lora的rank值,用于控制模型的复杂度 9 bias="none", # 不使用偏置项 10 target_modules="all-linear", # 应用Lora的目标模块,这里设置为所有线性层 11 task_type="CAUSAL_LM", # 任务类型,这里设置为因果语言模型 12) 13 14# 设置训练参数 15training_arguments = TrainingArguments( 16 output_dir=output_dir, # 训练权重的输出目录 17 num_train_epochs=1, # 训练轮数 18 per_device_train_batch_size=1, # 每个设备的训练批次大小 19 gradient_accumulation_steps=8, # 梯度累积步数 20 gradient_checkpointing=True, # 使用梯度检查点来节省内存 21 optim="paged_adamw_32bit", # 优化器 22 save_steps=0, # 保存步骤 23 logging_steps=25, # 日志记录步数 24 learning_rate=2e-4, # 学习率 25 weight_decay=0.001, # 权重衰减 26 fp16=True, # 使用半精度浮点数进行训练 27 bf16=False, # 不使用混合精度浮点数进行训练 28 max_grad_norm=0.3, # 最大梯度范数 29 max_steps=-1, # 最大步数 30 warmup_ratio=0.03, # 预热比例 31 group_by_length=True, # 根据序列长度分组 32 lr_scheduler_type="cosine", # 使用余弦学习率调度器 33 report_to="tensorboard", # 将指标报告给TensorBoard 34 evaluation_strategy="epoch" # 每轮结束后保存检查点 35) 36 37# 创建SFTTrainer对象,用于训练和评估模型 38trainer = SFTTrainer( 39 model=model, # 模型 40 args=training_arguments, # 训练参数 41 train_dataset=train_data, # 训练数据集 42 eval_dataset=eval_data, # 评估数据集 43 peft_config=peft_config, # Lora配置 44 dataset_text_field="text", # 数据集中的文本字段 45 tokenizer=tokenizer, # 分词器 46 max_seq_length=1024, # 最大序列长度 47 packing=False, # 不使用packing 48 dataset_kwargs={ 49 "add_special_tokens": False, # 不添加特殊标记 50 "append_concat_token": False, # 不追加连接标记 51 } 52) 53 54# 训练模型 55trainer.train() 56 57# 保存训练好的模型和分词器 58trainer.save_model() 59tokenizer.save_pretrained(output_dir)
# Release the memory held by the training run
import gc

# Drop every reference that is no longer needed
del model, tokenizer, peft_config, trainer, train_data, eval_data, bnb_config, training_arguments
del df, X_train, X_eval
del TrainingArguments, SFTTrainer, LoraConfig, BitsAndBytesConfig

# Repeatedly empty the CUDA cache and force a garbage-collection pass
for _ in range(100):
    torch.cuda.empty_cache()
    gc.collect()
1!nvidia-smi 2Mon Apr 15 08:48:41 2024 3+---------------------------------------------------------------------------------------+ 4| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 | 5|-----------------------------------------+----------------------+----------------------+ 6| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 7| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 8| | | MIG M. | 9|=========================================+======================+======================| 10| 0 Tesla P100-PCIE-16GB Off | 00000000:00:04.0 Off | 0 | 11| N/A 57C P0 44W / 250W | 1926MiB / 16384MiB | 0% Default | 12| | | N/A | 13+-----------------------------------------+----------------------+----------------------+ 14 15+---------------------------------------------------------------------------------------+ 16| Processes: | 17| GPU GI CI PID Type Process name GPU Memory | 18| ID ID Usage | 19|=======================================================================================| 20+---------------------------------------------------------------------------------------+ 21# 导入所需的库和模块 22from peft import AutoPeftModelForCausalLM 23import torch 24from transformers import AutoTokenizer 25 26# 设置模型的路径和计算类型 27finetuned_model = "./trained_weigths/" 28compute_dtype = getattr(torch, "float16") 29 30# 加载预训练的分词器 31tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/llama2-7b-hf/Llama2-7b-hf") 32 33# 从预训练的模型中创建AutoPeftModelForCausalLM实例 34model = AutoPeftModelForCausalLM.from_pretrained( 35 finetuned_model, 36 torch_dtype=compute_dtype, 37 return_dict=False, 38 low_cpu_mem_usage=True, 39 device_map=device, 40) 41 42# 合并模型并卸载原始模型 43merged_model = model.merge_and_unload() 44 45# 保存合并后的模型和分词器到指定路径 46merged_model.save_pretrained("./merged_model",safe_serialization=True, max_shard_size="2GB") 47tokenizer.save_pretrained("./merged_model")
# Predict on the test prompts with the merged, fine-tuned model.
# Pass the prompt frame X_test rather than `test`, which is only the leftover
# per-class split from the data-preparation loop.
y_pred = predict(X_test, merged_model, tokenizer)
# Score the predictions y_pred against the true labels y_true
evaluate(y_true, y_pred)
# Output recorded for this run:
# 100%|██████████| 900/900 [03:49<00:00, 3.93it/s]
#
# Accuracy: 0.817
# Accuracy for label 0: 0.967
# Accuracy for label 1: 0.647
# Accuracy for label 2: 0.837
#
# Classification Report:
#               precision    recall  f1-score   support
#
#            0       0.91      0.97      0.94       300
#            1       0.80      0.65      0.71       300
#            2       0.74      0.84      0.79       300
#
#     accuracy                           0.82       900
#    macro avg       0.82      0.82      0.81       900
# weighted avg       0.82      0.82      0.81       900
#
# Confusion Matrix:
# [[290   8   2]
#  [ 22 194  84]
#  [  7  42 251]]
想自己动手实战的朋友,可以参考我的代码:https://www.kaggle.com/code/wuxiognwei2/lora-llama2-7b