当前位置：首页 > news >正文

瞬间将模型缩小到原来的 1/60～1/200

news 2025/7/1 17:39:38

代码

import paddle
import faiss
from new_model_13 import GPT as GPT13

import pandas as pd
from sklearn.preprocessing import normalize
import json
import math
from collections import Counter
from tqdm import tqdm
import numpy as np


# Small vocabulary: 16 positions x 62 characters (digits + lower/upper-case letters) = 992 tokens
def gen_small_voc():
    """Build the small vocabulary that maps integer ids to ``"<position>_<char>"`` tokens.

    The character set is the 10 digits plus the 26 lower-case and 26 upper-case
    letters (62 characters). One token is generated for each of the 16 positions
    combined with each character, and ids are assigned consecutively from 0 in
    position-major order.

    Returns:
        dict mapping ``int`` id -> token string, with 16 * 62 = 992 entries.
    """
    alphabet = "0123456789" + 'qwertyuiopasdfghjklzxcvbnm' + "QWERTYUIOPASDFGHJKLZXCVBNM"
    tokens = ["{}_{}".format(position, char) for position in range(16) for char in alphabet]
    return dict(enumerate(tokens))


def random_gen_voc():
    """Draw a random 16-position code string such as ``"0_a#1_Z#...#15_7"``.

    For each position 0..15 one character is picked with ``np.random.choice``
    from the 62-character set (digits, lower-case and upper-case letters); the
    resulting ``"<position>_<char>"`` tokens are joined with ``"#"``.

    Returns:
        str: the ``"#"``-joined code made of 16 tokens.
    """
    alphabet = list("0123456789" + 'qwertyuiopasdfghjklzxcvbnm' + "QWERTYUIOPASDFGHJKLZXCVBNM")
    pieces = []
    for position in range(16):
        # One independent draw per position, in position order.
        picked = np.random.choice(alphabet)
        pieces.append("{}_{}".format(position, picked))
    return "#".join(pieces)


def gen_text_voc_to_token_id(text, large_em_voc, small_voc_em):
    """Encode ``text`` item by item into lists of small-vocabulary ids.

    ``large_em_voc`` is used as a two-way table in a single dict: it maps a
    text item to its ``"#"``-joined code and the code back to the item. An
    item that has no code yet gets a fresh one from ``random_gen_voc()``
    (redrawn until the code is unused), and both directions are stored, so
    ``large_em_voc`` is mutated in place.

    Args:
        text: iterable of items (e.g. a string of characters) to encode.
        large_em_voc: dict holding item -> code and code -> item entries.
        small_voc_em: dict mapping each ``"<position>_<char>"`` token to its id.

    Returns:
        tuple ``(ids, large_em_voc)`` where ``ids`` has one list per item of
        ``text``; each inner list holds ``small_voc_em.get(token)`` for every
        token of the item's code (``None`` for tokens missing from the table).
    """
    encoded = []
    for item in list(text):
        code = large_em_voc.get(item, None)
        if code is None:
            # Unknown item: keep drawing until the code is not taken yet.
            code = random_gen_voc()
            while large_em_voc.get(code, None) is not None:
                code = random_gen_voc()
            large_em_voc[code] = item
            large_em_voc[item] = code
        encoded.append([small_voc_em.get(token) for token in code.split("#")])

    return encoded, large_em_voc


def train():
    """Train the GPT13 model on lines read from ``唐诗.json`` and checkpoint every epoch.

    Steps:
      1. Load the JSON file, split field 4 of every record on whitespace, keep
         records with more than 3 pieces and flatten all pieces into one array.
      2. Keep only pieces that are exactly 24 characters long (whitespace
         removed) and contain none of the excluded characters.
      3. Encode each batch of up to 1000 lines with
         ``gen_text_voc_to_token_id`` (the large vocabulary grows on the fly)
         and train with cross-entropy on the input shifted by one step.
      4. After every epoch write the model weights and both vocabularies to
         ``duo_yang_xing.pkl``, ``large_em_voc.pkl`` and ``small_em_voc.pkl``.
    """
    with open("唐诗.json", "r", encoding="utf-8") as f:
        records = json.loads(f.read())

    pieces = np.hstack([rec[4].split() for rec in records if len(rec[4].split()) > 3])
    # A line is dropped as soon as it contains any one of these characters.
    excluded = ("a", "f", "e", "h", "X", "“", '□', '《', '》')
    data = [
        line for line in pieces
        if len("".join(line.split())) == 24 and not any(ch in line for ch in excluded)
    ]

    small_em_voc = gen_small_voc()
    small_voc_em = {token: idx for idx, token in small_em_voc.items()}
    large_em_voc = dict()

    model = GPT13(len(small_em_voc), 512, 32, 8)
    # Optional warm start kept disabled: model.load_dict(paddle.load("gpt.pdparams"))
    # Size estimate: first dim * last dim for multi-dimensional parameters, last dim otherwise.
    param_estimate = sum(
        [p.shape[0] * p.shape[-1] if len(p.shape) > 1 else p.shape[-1] for p in model.parameters()])
    print("参数量:", param_estimate / 1000000000, "B")
    loss_func = paddle.nn.CrossEntropyLoss()
    opt = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.0003)

    batch_size = 1000
    for epoch in range(190):
        bar = tqdm(range(0, len(data), batch_size))
        for start in bar:
            batch_ids = []
            for line in data[start:start + batch_size]:
                line_ids, large_em_voc = gen_text_voc_to_token_id(line, large_em_voc, small_voc_em)
                batch_ids.append(line_ids)

            batch = paddle.to_tensor(batch_ids)
            # Inputs are every step but the last; targets are every step but the first.
            out, _ = model(batch[:, :-1])
            target = batch[:, 1:].reshape([out.shape[0], -1])
            loss = loss_func(out, target)
            bar.set_description("epoch___{}__loss__{}".format(epoch, loss.item()))
            opt.clear_grad()
            loss.backward()
            opt.step()

        # Checkpoint the weights together with the vocabularies built so far.
        paddle.save(model.state_dict(), "duo_yang_xing.pkl")
        pd.to_pickle(large_em_voc, "large_em_voc.pkl")
        pd.to_pickle(small_em_voc, "small_em_voc.pkl")


def val():
    """Generate text from the trained model by matching top-k tokens against known codes.

    The data file is filtered exactly as in training, the pickled vocabularies
    and weights are loaded, and generation starts from the first 10 characters
    of ``data[38]``. In each of 17 iterations the model's top-k token ids for
    the last 16 output positions are used to narrow the table of known
    16-token codes position by position (positions 0..15 in order). The first
    surviving code is decoded to its character and appended to the prompt. If
    no code survives, ``top_k`` is enlarged by one and the iteration is spent
    without output.
    """
    with open("唐诗.json", "r", encoding="utf-8") as f:
        records = json.loads(f.read())

    pieces = np.hstack([rec[4].split() for rec in records if len(rec[4].split()) > 3])
    excluded = ("a", "f", "e", "h", "X", "“", '□', '《', '》')
    data = [
        line for line in pieces
        if len("".join(line.split())) == 24 and not any(ch in line for ch in excluded)
    ]

    small_em_voc = pd.read_pickle("small_em_voc.pkl")
    small_voc_em = {token: idx for idx, token in small_em_voc.items()}
    large_em_voc = pd.read_pickle("large_em_voc.pkl")

    model = GPT13(len(small_em_voc), 512, 32, 8)
    model.load_dict(paddle.load("duo_yang_xing.pkl"))
    model.eval()

    param_estimate = sum(
        [p.shape[0] * p.shape[-1] if len(p.shape) > 1 else p.shape[-1] for p in model.parameters()])
    print("参数量:", param_estimate / 1000000000, "B")

    word = data[38][:10]
    # Keys longer than 10 characters are the "#"-joined codes (the other keys are the text items).
    code_rows = [key.split("#") for key in large_em_voc.keys() if len(key) > 10]
    df_large_voc = pd.DataFrame(code_rows)
    top_k = 10
    for _ in range(17):
        prompt_ids, large_em_voc = gen_text_voc_to_token_id(word, large_em_voc, small_voc_em)
        out, _ = model(paddle.to_tensor(prompt_ids).unsqueeze(0))

        # Top-k ids for the last 16 output positions of the single batch item.
        _, top_ids = paddle.topk(out, top_k)
        top_ids = top_ids[0, -16:]

        # One row per rank, one column per position, holding token strings.
        ranked_tokens = [[small_em_voc.get(tid.item()) for tid in rank_row] for rank_row in top_ids.T]
        df_out = pd.DataFrame(ranked_tokens)

        remaining = df_large_voc.copy()
        for position in range(16):
            remaining = remaining[remaining[position].isin(df_out[position])]
            if len(remaining) < 3:
                # Stop narrowing once fewer than 3 candidate codes are left.
                break

        if remaining.empty:
            # Nothing matched: widen the candidate set and try again.
            top_k += 1
            continue

        best_code = remaining.values.tolist()[0]
        word += large_em_voc.get("#".join(best_code))

        print(word)
        top_k = 10


if __name__ == '__main__':
    # Script entry point. Training is left commented out, so running this file
    # only executes val(), which reads the pickled vocabularies and weights
    # that train() writes at the end of each epoch.
    # train()
    val()

解释

这段代码的目的是创建一个词到ID的映射，以便于将文本数据转换为机器学习模型可以理解的数字格式。具体来说，这个映射是通过以下步骤构建的：

gen_small_voc 函数构建小词表：字符集由数字和大小写字母共 62 个字符组成，对 16 个位置中的每一个位置 i 和每一个字符 n 生成一个形如 “i_n” 的 token，并按生成顺序为其分配唯一的整数 ID（共 16×62=992 个）。
random_gen_voc 函数为 16 个位置各随机选取一个字符，得到 16 个形如 “i_n” 的 token，再用 “#” 把它们连接成一个字符串，作为某个字符在大词表中的随机编码。
gen_text_voc_to_token_id 函数接受一个文本字符串、一个大词表和一个小词表（token 到整数 ID 的映射）作为输入。对于文本中的每个字符，函数首先检查它是否已经在大词表中。如果不在，函数就会反复随机生成编码，直到得到一个尚未被使用的编码，并把“字符→编码”和“编码→字符”两个方向的映射都写入大词表。然后，函数将这个字符的编码（无论是已经存在的还是新创建的）按 “#” 拆开，通过小词表转换为一个含 16 个整数的列表，并将其添加到输出列表中。
这个构建词表的过程的主要优点是，它可以处理任何文本数据，即使数据中包含未知的字符。这是因为如果遇到一个未知的字符，函数会自动为它生成一个新的ID，并将其添加到大词表中。这使得这个方法非常灵活，可以处理各种不同的文本数据。

import math


import paddle
import paddle.nn as nn


class MaxState(paddle.nn.Layer):
    """Token mixer that replaces attention with a causal running maximum.

    The input is projected by a bias-free linear layer, then for every
    position the element-wise maximum over the current and all earlier
    positions is taken. The sequence is processed in chunks of ``win``
    positions; the maximum reached at the end of a chunk is carried into the
    next chunk (and returned) as ``state``.
    """

    def __init__(self, hidden_dim, heads, win):
        """Create the layer.

        Args:
            hidden_dim: feature size; must be divisible by ``heads``.
            heads: number of heads, used only to reshape inside ``forward``.
            win: chunk length used when scanning the sequence.
        """
        super(MaxState, self).__init__()

        assert hidden_dim % heads == 0, "Hidden size must be divisible by the number of heads."

        self.head_size = hidden_dim // heads
        # self.head =paddle.nn.Linear(hidden_dim,2*hidden_dim,bias_attr=False)
        self.head = paddle.nn.Linear(hidden_dim, hidden_dim, bias_attr=False)
        # self.head_out =paddle.nn.Linear(hidden_dim*2,hidden_dim,bias_attr=False)
        self.head_num = heads
        self.win = win
        self.hidden = hidden_dim
        # Upper-triangular (win x win) matrix of ones: entry (row, col) is 1 when col >= row.
        # NOTE(review): this is a float tensor, while paddle.where documents a bool
        # condition -- confirm the installed Paddle version accepts it.
        self.mask = paddle.triu(paddle.ones([win, win]))

    def forward(self, input_data, state=None):
        """Apply the running-max mixing.

        Args:
            input_data: tensor indexed as (batch, sequence, hidden_dim); the
                first two sizes are read from ``shape[0]`` and ``shape[1]``.
            state: optional running maximum carried over from a previous
                call; when ``None`` it starts at negative infinity.

        Returns:
            tuple ``(out, state)``: ``out`` is reshaped to
            (batch, sequence, -1) and ``state`` is the running maximum after
            the last processed chunk.
        """
        b, s, k, h, w = input_data.shape[0], input_data.shape[1], self.head_num, self.head_size, self.win

        # Row vector of ones used to copy a value into w columns via matmul.
        window = paddle.ones([1, w])

        out = self.head(input_data)

        # (b, s, hidden) -> (b, s, hidden, w): every projected value repeated w times.
        out = out.unsqueeze(-1) @ window

        # -> (b, hidden, s, w)
        out = out.transpose([0, 2, 1, 3])

        one_list = []
        if state is None:
            # Neutral element for max.
            state = paddle.ones([out.shape[0], out.shape[1], 1, 1]) * float("-inf")
        for i in range(0, s, w):
            j = w + i
            # Rows are the positions of this chunk (r <= w of them, the last chunk may be shorter).
            one = out[:, :, i:j]
            _, _, r, c = one.shape
            if r != self.win:

                # Short final chunk: use only the first r mask rows.
                one = paddle.where(self.mask[:r, :], one, paddle.to_tensor(-float('inf')))
            else:
                # Column c keeps rows 0..c only, i.e. positions up to and including c.
                one = paddle.where(self.mask, one, paddle.to_tensor(-float('inf')))

            # Append the carried maximum as one extra row so it takes part in the max below.
            one = paddle.concat([one, state @ window], axis=2)
            # Max over rows: column c becomes max(carried state, chunk positions 0..c).
            state = paddle.max(one, axis=2, keepdim=True)
            one = state.reshape([b, k, h, w])
            # The last column has seen every row of the chunk; carry it forward.
            state = state[..., -1:]
            if r != self.win:
                # Drop the columns that lie beyond the short chunk.
                one = one[..., :r]

            # -> (b, chunk positions, heads, head_size)
            one = one.transpose([0, 3, 1, 2])
            one_list.append(one)
        # Re-assemble the chunks along the sequence axis.
        out = paddle.concat(one_list, 1)
        out = out.reshape([b, s, -1])
        # out = self.head_out(out)

        return out, state



class FeedForward(nn.Layer):
    """Gated feed-forward block computing ``ffn2(ffn1(x) * silu(gate(x)))``.

    The inner width is twice ``hidden_size``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        inner_size = hidden_size * 2

        # Sublayers are created in the same order as before (ffn1, ffn2, gate).
        self.ffn1 = nn.Linear(hidden_size, inner_size)
        self.ffn2 = nn.Linear(inner_size, hidden_size)
        self.gate = nn.Linear(hidden_size, inner_size)
        # The attribute is named ``relu`` but the activation is SiLU.
        self.relu = nn.Silu()

    def forward(self, x):
        """Return the gated projection of ``x`` with the same last dimension."""
        value = self.ffn1(x)
        gating = self.relu(self.gate(x))
        return self.ffn2(value * gating)


class RMSNorm(nn.Layer):
    """Root-mean-square normalisation over the last axis with a learned scale.

    Args:
        dim: size of the last axis; the scale parameter ``fc`` has shape ``[dim]``
            and is initialised to 1.0.
        eps: constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, dim, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.fc = paddle.create_parameter(
            shape=[dim],
            dtype='float32',
            default_initializer=nn.initializer.Constant(value=1.0),
        )

    def norm(self, x):
        """Divide ``x`` by the RMS of its last axis (unscaled)."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * paddle.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """Normalise ``x`` and multiply by the learned scale."""
        return self.norm(x) * self.fc


class GPTDecoderLayer(nn.Layer):
    """One decoder block: MaxState mixing, then a gated feed-forward, each followed by a norm.

    Both sub-blocks add their input back (residual) before normalising:
    LayerNorm after the mixer, RMSNorm after the feed-forward.
    """

    def __init__(self, hidden_size, num_heads):
        super().__init__()
        # MaxState stands in for masked multi-head attention; its window is fixed at 8.
        self.self_attention = MaxState(hidden_size, num_heads, 8)
        self.ffn = FeedForward(hidden_size)
        self.norm = nn.LayerNorm(hidden_size)
        self.norm1 = RMSNorm(hidden_size)

    def forward(self, x, state=None, seq_len=None):
        """Run the block.

        Args:
            x: input tensor passed to the mixer.
            state: optional running state forwarded to ``MaxState``.
            seq_len: accepted for interface compatibility; not used here.

        Returns:
            tuple ``(x, state)`` with the block output and the updated mixer state.
        """
        mixed, state = self.self_attention(x, state)
        x = self.norm(mixed + x)

        x = self.norm1(self.ffn(x) + x)
        return x, state


class PositionalEncoding(nn.Layer):
    """Additive sine/cosine positional encoding.

    A ``[1, max_len, d_model]`` table is precomputed: even feature indices hold
    the sine and odd indices the cosine of ``position * div_term``.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Column vector of positions 0..max_len-1.
        position = paddle.arange(max_len).unsqueeze(1).astype("float32")
        # One frequency per pair of feature indices.
        div_term = paddle.exp(paddle.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term

        table = paddle.zeros([max_len, d_model])
        table[:, 0::2] = paddle.sin(angles)
        table[:, 1::2] = paddle.cos(angles)
        # Stored as a plain tensor attribute with a leading batch axis: [1, max_len, d_model].
        self.pe = table.unsqueeze(0)

    def forward(self, x, seq_len=None):
        """Add the encoding to ``x`` (indexed as [batch_size, seq_len, d_model]).

        When ``seq_len`` is given, only the single row at index ``seq_len - 1``
        is added; otherwise the first ``x.shape[1]`` rows are added.
        """
        if seq_len is not None:
            return x + self.pe[:, seq_len - 1:seq_len, :]

        length = x.shape[1]
        return x + self.pe[:, :length, :]


# %%

def sinusoidal_position_embedding(max_len, output_dim):
    """Precompute the sine and cosine tables used by ``rope``.

    Args:
        max_len: number of positions to tabulate.
        output_dim: feature width; ``output_dim // 2`` frequencies are produced.

    Returns:
        tuple ``(sin, cos)``, each of shape ``(max_len, output_dim // 2)``,
        holding sin/cos of ``pos / 10000 ** (2 * i / output_dim)``.
    """
    # (max_len, 1)
    positions = paddle.arange(0, max_len, dtype="float32").unsqueeze(-1)
    # (output_dim // 2): the index i of the formula, i = 0 .. output_dim // 2 - 1
    freq_ids = paddle.arange(0, output_dim // 2, dtype="float32")
    inv_freq = 10000 ** (-2 * freq_ids / output_dim)
    # (max_len, output_dim // 2): pos / (10000 ^ (2i / d))
    angles = positions * inv_freq
    return paddle.sin(angles), paddle.cos(angles)


def rope(q, sin_em, cos_em, seq_len=None):
    """Rotate consecutive feature pairs of ``q`` by position-dependent angles.

    Args:
        q: 4-D tensor; axis 2 is used as the position axis and the last axis
            is split into (even, odd) pairs.
        sin_em: sine table indexed by position along its first axis.
        cos_em: cosine table indexed by position along its first axis.
        seq_len: when given, only the table row at index ``seq_len - 1`` is
            used; otherwise the first ``q.shape[2]`` rows are used.

    Returns:
        Tensor with the same shape as ``q`` holding the rotated pairs.
    """
    if seq_len is not None:
        sin_em = sin_em[seq_len - 1:seq_len]
        cos_em = cos_em[seq_len - 1:seq_len]
    else:
        positions = q.shape[2]
        sin_em = sin_em[:positions]
        cos_em = cos_em[:positions]

    # Split the last axis into pairs: index 0 is the even feature, index 1 the odd one.
    pairs = q.reshape([q.shape[0], q.shape[1], q.shape[2], -1, 2])
    odd = pairs[..., 1]
    even = pairs[..., 0]

    # New even = even * cos - odd * sin; new odd = odd * cos + even * sin.
    rotated_even = -odd * sin_em + even * cos_em
    rotated_odd = odd * cos_em + even * sin_em

    # Stacking on a new last axis and reshaping interleaves even/odd again.
    return paddle.stack([rotated_even, rotated_odd], -1).reshape(q.shape)


class GPT(nn.Layer):
    """Decoder-only model over 16-token character codes.

    Every sequence step is a group of small-vocabulary ids (the output head
    hard-codes 16 per step). The group's embeddings are max-pooled into one
    hidden vector, passed through the decoder stack, and expanded back to 16
    per-token vectors by a 1-D convolution before the vocabulary projection.
    """

    def __init__(self, vocab_size, hidden_size, num_heads, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # Not used in forward(); kept so the parameter set stays unchanged.
        self.label_embedding = nn.Embedding(vocab_size, hidden_size)

        blocks = [GPTDecoderLayer(hidden_size, num_heads) for _ in range(num_layers)]
        self.decoder_layers = nn.LayerList(blocks)
        self.fc = nn.Linear(hidden_size, vocab_size, bias_attr=False)

        rope_dim = hidden_size // num_heads // 2
        self.sin_em, self.cos_em = sinusoidal_position_embedding(50000, rope_dim)
        # Expands one pooled vector (1 channel) into 16 channels, one per code position.
        self.conv = paddle.nn.Conv1D(1, 16, kernel_size=3, padding=1, bias_attr=False)
        # Not used in forward(); kept so the parameter set stays unchanged.
        self.out = nn.Linear(16, 16, bias_attr=False)

        self.layer_nor = paddle.nn.LayerNorm(hidden_size)

    def forward(self, xx, state=None, seq_len=None):
        """Compute vocabulary logits.

        Args:
            xx: integer id tensor; assumed (batch, steps, 16) -- TODO confirm
                against callers.
            state: optional list with one entry per decoder layer; updated in
                place and returned. A fresh list of ``None`` is used if omitted.
            seq_len: forwarded to ``rope`` to select a single position.

        Returns:
            tuple ``(out, state)`` where ``out`` is reshaped to
            (batch, -1, hidden) before the final projection to the vocabulary.
        """
        xx = self.embedding(xx)
        # Pool over the second-to-last axis (the tokens of one step).
        x = paddle.max(xx, -2)

        if state is None:
            state = [None] * len(self.decoder_layers)

        # Rotary term: split hidden into groups as wide as the rope tables, rotate, add back to x.
        group_width = self.sin_em.shape[1] * 2
        grouped = x.reshape([x.shape[0], x.shape[1], -1, group_width]).transpose([0, 2, 1, 3])
        rotated = rope(grouped, self.sin_em, self.cos_em, seq_len)
        x = rotated.transpose([0, 2, 1, 3]).reshape(x.shape) + x

        for idx, decoder_layer in enumerate(self.decoder_layers):
            layer_out, state[idx] = decoder_layer(x, state[idx])
            x = layer_out + x

        hidden = x.shape[-1]
        # One pooled vector per step -> 16 vectors, plus the original token embeddings.
        out = self.conv(x.reshape([-1, 1, hidden])) + xx.reshape([-1, 16, hidden])
        out = out.reshape([x.shape[0], -1, hidden])
        return self.fc(self.layer_nor(out)), state

随机版本

def val():
    """Generate text like the top-k matching variant, but filter positions in random order.

    The data file is filtered as in training, the pickled vocabularies and
    weights are loaded, and generation starts from the first 10 characters of
    ``data[38]``. In each of 17 iterations the top-k token ids of the last 16
    output positions narrow the table of known 16-token codes; the 16
    positions are visited in a random permutation. The first surviving code is
    decoded and appended; if none survives, ``top_k`` grows by one and the
    iteration produces no output.
    """
    with open("唐诗.json", "r", encoding="utf-8") as f:
        records = json.loads(f.read())

    pieces = np.hstack([rec[4].split() for rec in records if len(rec[4].split()) > 3])
    excluded = ("a", "f", "e", "h", "X", "“", '□', '《', '》')
    data = [
        line for line in pieces
        if len("".join(line.split())) == 24 and not any(ch in line for ch in excluded)
    ]

    small_em_voc = pd.read_pickle("small_em_voc.pkl")
    small_voc_em = {token: idx for idx, token in small_em_voc.items()}
    large_em_voc = pd.read_pickle("large_em_voc.pkl")

    model = GPT13(len(small_em_voc), 512, 32, 8)
    model.load_dict(paddle.load("duo_yang_xing.pkl"))
    model.eval()

    param_estimate = sum(
        [p.shape[0] * p.shape[-1] if len(p.shape) > 1 else p.shape[-1] for p in model.parameters()])
    print("参数量:", param_estimate / 1000000000, "B")

    word = data[38][:10]
    # Keys longer than 10 characters are the "#"-joined codes.
    code_rows = [key.split("#") for key in large_em_voc.keys() if len(key) > 10]
    df_large_voc = pd.DataFrame(code_rows)
    top_k = 10
    for _ in range(17):
        prompt_ids, large_em_voc = gen_text_voc_to_token_id(word, large_em_voc, small_voc_em)
        out, _ = model(paddle.to_tensor(prompt_ids).unsqueeze(0))

        # Top-k ids for the last 16 output positions of the single batch item.
        _, top_ids = paddle.topk(out, top_k)
        top_ids = top_ids[0, -16:]

        # One row per rank, one column per position, holding token strings.
        ranked_tokens = [[small_em_voc.get(tid.item()) for tid in rank_row] for rank_row in top_ids.T]
        df_out = pd.DataFrame(ranked_tokens)

        remaining = df_large_voc.copy()
        # Visit every position exactly once, in random order.
        for position in np.random.choice(list(range(16)), 16, replace=False):
            remaining = remaining[remaining[position].isin(df_out[position])]
            if len(remaining) < 3:
                # Stop narrowing once fewer than 3 candidate codes are left.
                break

        if remaining.empty:
            # Nothing matched: widen the candidate set and try again.
            top_k += 1
            continue

        best_code = remaining.values.tolist()[0]
        word += large_em_voc.get("#".join(best_code))

        print(word)
        top_k = 10

向量版本

def val():
    """Generate text greedily, falling back to embedding similarity for unknown codes.

    The data file is filtered as in training, the pickled vocabularies and
    weights are loaded, and generation starts from the first 10 characters of
    ``data[0]``. In each of 17 iterations the arg-max token ids of the last 16
    output positions are joined into a code. If the code exists in the large
    vocabulary its character is appended; otherwise the known character whose
    flattened code embedding has the highest cosine similarity to the
    predicted code's embedding is appended.
    """
    with open("唐诗.json", "r", encoding="utf-8") as f:
        records = json.loads(f.read())

    pieces = np.hstack([rec[4].split() for rec in records if len(rec[4].split()) > 3])
    excluded = ("a", "f", "e", "h", "X", "“", '□', '《', '》')
    data = [
        line for line in pieces
        if len("".join(line.split())) == 24 and not any(ch in line for ch in excluded)
    ]

    small_em_voc = pd.read_pickle("small_em_voc.pkl")
    small_voc_em = {token: idx for idx, token in small_em_voc.items()}
    large_em_voc = pd.read_pickle("large_em_voc.pkl")

    model = GPT13(len(small_em_voc), 512, 32, 8)
    model.load_dict(paddle.load("duo_yang_xing.pkl"))
    model.eval()

    param_estimate = sum(
        [p.shape[0] * p.shape[-1] if len(p.shape) > 1 else p.shape[-1] for p in model.parameters()])
    print("参数量:", param_estimate / 1000000000, "B")

    # Reference table: every key of length <= 1 (a single text item) with the embedding of its code.
    # (A max-pooled variant of these embeddings was left disabled in the original.)
    k_list = []
    v_list = []
    for key, code in large_em_voc.items():
        if len(key) > 1:
            continue
        code_ids = [small_voc_em.get(token) for token in code.split("#")]
        v_list.append(model.embedding(paddle.to_tensor(code_ids).reshape([1, -1])))
        k_list.append(key)

    word = data[0][:10]
    for _ in range(17):
        prompt_ids, large_em_voc = gen_text_voc_to_token_id(word, large_em_voc, small_voc_em)
        logits, _ = model(paddle.to_tensor(prompt_ids).unsqueeze(0))

        # Greedy ids for the last 16 output positions.
        out = paddle.argmax(logits, -1)[:, -16:]
        predicted_code = "#".join([small_em_voc.get(tid.item()) for tid in out[0]])
        out_voc = large_em_voc.get(predicted_code)

        if out_voc is not None:
            word += out_voc
        else:
            # Unknown code: pick the most similar known code by cosine similarity.
            out_em = model.embedding(out)
            similarities = [
                paddle.nn.functional.cosine_similarity(out_em.reshape([1, -1]), ref.reshape([1, -1])).item()
                for ref in v_list
            ]
            out_sort = np.argsort(similarities)
            word += k_list[out_sort[-1]]
        print(word)