基礎(chǔ)數(shù)據(jù)準(zhǔn)備
訓(xùn)練所需要的數(shù)據(jù)集合都存儲在數(shù)據(jù)庫中,還有部分文本文件
首先對數(shù)據(jù)進(jìn)行分類結(jié)構(gòu)化存儲[因?yàn)樯婕暗降氖嵌喾诸悊栴}]
整理并存儲原始數(shù)據(jù)集
使用numpy將所有需要數(shù)據(jù)讀取出來
splitlines() ==> 按照 \r、\n 或者 \r\n 分割
import numpy as np
import pandas as pd
# Load the two raw text corpora, one sample per line.
# Fixed: the original `open(...).read()` calls never closed their file
# handles; `with` guarantees the files are closed even on error.
with open(r'text1.txt', 'r', encoding='utf-8') as f:
    values1 = np.array(f.read().splitlines())
with open(r'text2.txt', 'r', encoding='utf-8') as f:
    # Down-sample the second (larger) corpus to 100,000 lines.
    # NOTE(review): np.random.choice samples WITH replacement by default,
    # so duplicates are possible — confirm that is intended.
    values2 = np.random.choice(f.read().splitlines(), 100000)
設(shè)計(jì)標(biāo)識符
# Map from numeric class id to the name of the source dataset.
# NOTE(review): this map is not referenced by the visible code, and the
# ids actually assigned below are 4 and 5 (see the `lable` array), not
# the 1 and 2 listed here — confirm which pair downstream consumers use.
label_map = {
1: 'values1',
2: 'values2',
}
將所有數(shù)據(jù)進(jìn)行拼接
data = np.concatenate([values1,values2])
生成相應(yīng)數(shù)量的標(biāo)識
lable = np.concatenate([np.array([4]*len(values1)),np.array([5]*len(values2))])
生成DataFrame數(shù)據(jù)結(jié)構(gòu)
df = pd.DataFrame({"data":data,"lable":lable})
提取數(shù)據(jù)結(jié)構(gòu)中多余的字符
df.replace('r|n|!', '', inplace=True, regex=True)
將整合后的原始數(shù)據(jù)存儲為csv文件
df.to_csv("dataset.csv",sep="!",index=False,header=False)
使數(shù)據(jù)集向量化
# NOTE(review): on Keras >= 2.9 these utilities moved to
# keras.utils.pad_sequences and keras.utils.to_categorical; the import
# paths below only resolve on older Keras versions — confirm the pinned
# Keras release.
from keras.preprocessing.sequence import pad_sequences # pad/truncate sequences to a common length
from keras.utils.np_utils import to_categorical # convert integer labels to one-hot vectors
對每個(gè)字符進(jìn)行 ord 操作
def process(s: str):
    """Lower-case *s* (coerced to str) and return the Unicode code point
    of each character as a list of ints."""
    return [ord(ch) for ch in str(s).lower()]
data = df['data'].Apply(process).values
將序列處理成相同長度的數(shù)組
# Pad (or truncate) every code-point sequence at the tail so that all
# samples share a fixed length of 30.
MAX_SEQUENCE_LENGTH = 30
data = pad_sequences(
    data,
    maxlen=MAX_SEQUENCE_LENGTH,
    dtype='int',
    padding='post',
    truncating='post',
)
去除數(shù)組內(nèi)重復(fù)數(shù)字并進(jìn)行排序之后輸出
palette = np.unique(data)
獲取每個(gè)字符在palette中的位置
data = np.digitize(data, palette, right=True)
將標(biāo)簽轉(zhuǎn)化為 one-hot 編碼
labels = to_categorical(df['lable'].values)
劃分訓(xùn)練子集與測試子集
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for validation; random_state=42 makes the
# split reproducible across runs.
train_data, val_data, train_label, val_label = train_test_split(data, labels, test_size=0.2, random_state=42)
# Sanity check: data and label row counts must agree for each subset.
print('train data shape: ', train_data.shape, ' train label shape: ', train_label.shape)
print('val data shape: ', val_data.shape, ' val label shape: ', val_label.shape)
#人工智能##深度學(xué)習(xí)##AI科技#