word2token小进展
论文精读 · 更改链接:prompt 主要适合分类任务或基于知识的任务,对不依赖知识的生成任务则不太适用。
mark 一下
由于[SEP]标记的目的是充当两个句子之间的分隔符,因此它符合您使用[SEP]标记分隔查询和答案序列的目标。
您还可以尝试添加不同的特殊标记来标记查询的开始和结束,答案也可以用同样的方式标记,例如:
# Register '[EOT]' with the tokenizer; special_tokens=True asks for it to be
# added as a special token rather than an ordinary vocabulary entry.
num_added_toks = tokenizer.add_tokens(['[EOT]'], special_tokens=True) ##This line is updated
# Resize the model's token embeddings to the tokenizer's new size, len(tokenizer).
model.resize_token_embeddings(len(tokenizer))
###The tokenizer has to be saved if it has to be reused
# NOTE(review): <output_dir> is a placeholder from the quoted answer, not valid
# Python -- replace it with a real directory path before running.
tokenizer.save_pretrained(<output_dir>)
很不错的source学习如何将文字转换成向量。
分割线
mark一下:
向tokenizer中添加特殊token的写法很不错。
# Group runs of consecutive equal ids for every sequence in id[0], e.g.
#   [1, 0, 0, 1, 0, 1, 1] -> [1, [0, 0], 1, 0, [1, 1]]
# A run of two or more equal ids becomes a nested list; a lone id is kept
# as a scalar. The result for the current sequence is left in `new_list`.
for train_id in id[0]:
    new_list = []
    dp_id = deque(train_id)
    if not dp_id:
        # An empty sequence has no runs; skip it instead of failing on the
        # first-element lookup.
        continue
    # `line` holds the run currently being collected; line[0] is its id.
    line = [dp_id.popleft()]
    print(dp_id)  # debug output kept from the original: ids still to scan
    while dp_id:
        now_num = dp_id.popleft()
        if line[0] == now_num:
            line.append(now_num)
            continue
        # The run just ended: emit it, then start a new run at now_num.
        new_list.append(line if len(line) >= 2 else line[0])
        line = [now_num]
    # Always flush the last run, including when the sequence had one element
    # and the loop above never executed.
    new_list.append(line if len(line) >= 2 else line[0])
不好意思,这段代码我不知道错在哪里了,不过真的达不到预期要求。
# Two-pointer run grouping for every sequence in id[0], e.g.
#   [1, 0, 0, 1, 0, 1, 1] -> [1, [0, 0], 1, 0, [1, 1]]
# `left` marks the first index of the current run and `right` scans forward
# to the first index whose value differs (or to the end of the sequence).
for train_id in id[0]:
    new_id = []
    length = len(train_id)
    left = 0
    while left < length:
        right = left + 1
        while right < length and train_id[left] == train_id[right]:
            right += 1
        # train_id[left:right] is one complete run; collect it by index so
        # only indexing and len() are required of train_id.
        run = [train_id[i] for i in range(left, right)]
        # Runs of length >= 2 are nested, single values stay scalar.
        new_id.append(run if len(run) >= 2 else run[0])
        # The value that ended the run starts the next one, so nothing is
        # skipped and the final element is always consumed.
        left = right
    print('train_id\t',train_id)
    print('new_id\t',new_id)
# Third attempt at grouping repeated ids: walks train_id by index and compares
# each value with `pin`, the value most recently appended to new_id.
# NOTE(review): `train_id`, `pin`, `new_id` and `temp_list` are not initialised
# in this fragment -- presumably set up by code that is not shown; confirm.
# NOTE(review): indentation was lost in this paste; the nesting has to be
# restored before the fragment can run.
for i in range(0,len(train_id)):
# New value and no repeats pending: emit it as a scalar and remember it.
if train_id[i] != pin and temp_list == []:
new_id.append(train_id[i])
pin = train_id[i]
# New value while repeats are pending: emit the pending repeats as a nested
# list, then the new value, and start over with an empty repeat list.
elif train_id[i] != pin and temp_list != []:
new_id.append(temp_list)
new_id.append(train_id[i])
temp_list = list()
pin = train_id[i]
# Same value as pin (not at index 0): record one more repeat.
# NOTE(review): the first occurrence was already appended to new_id as a
# scalar, so temp_list only ever holds the second and later occurrences.
elif train_id[i] == pin and i != 0:
temp_list.append(pin)
# First element already equal to the initial pin: emit it as a scalar.
elif train_id[i] == pin and i == 0:
new_id.append(pin)
# NOTE(review): temp_list is not appended to new_id before this print, so
# repeats at the end of train_id appear to be left out -- confirm intent.
print(new_id)
改进版,将dialog也融入其中
def regroup_process(self, dialog, id):
    """Merge consecutive utterances that share the same id into one string.

    Within each dialogue, neighbouring utterances whose ids compare equal are
    joined with the ``'[SEP]'`` token. An utterance whose id differs from both
    of its neighbours is passed through unchanged (it is not joined, so it
    keeps its original type).

    Example for a single dialogue::

        ids:        [1, 0, 0, 1, 0, 1, 1]
        utterances: [u0, u1, u2, u3, u4, u5, u6]
        id groups:  [1, [0, 0], 1, 0, [1, 1]]
        result:     [u0, 'u1[SEP]u2', u3, u4, 'u5[SEP]u6']

    Args:
        dialog: Container whose first element (``dialog[0]``) is a sequence
            of dialogues, each a sequence of utterances. Utterances that get
            merged must be strings because they are passed to ``str.join``.
        id: Container whose first element (``id[0]``) is a sequence of id
            sequences, parallel to ``dialog[0]``. The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        A list with one entry per dialogue; each entry is the list of
        regrouped utterances for that dialogue. An empty dialogue yields an
        empty list.

    Raises:
        IndexError: If a dialogue has fewer utterances than ids.
    """
    from itertools import groupby

    new_dialog_all = []
    for train_id, train_dialog in zip(id[0], dialog[0]):
        # Every id needs an utterance; fail loudly rather than letting zip()
        # silently drop the ids that have no partner.
        if len(train_dialog) < len(train_id):
            raise IndexError(
                'dialogue has fewer utterances than ids: '
                '{} < {}'.format(len(train_dialog), len(train_id))
            )

        new_dialogs = []
        # groupby() only merges *adjacent* equal keys, which is exactly the
        # "consecutive run" semantics needed here (no sorting on purpose).
        for _, run in groupby(zip(train_id, train_dialog), key=lambda pair: pair[0]):
            utterances = [utterance for _, utterance in run]
            if len(utterances) >= 2:
                # Separate merged utterances with the special token.
                # NOTE (original author): image information would be fed in
                # here; an empty string '' in the dialogue marks an image.
                new_dialogs.append('[SEP]'.join(utterances))
            else:
                new_dialogs.append(utterances[0])
        new_dialog_all.append(new_dialogs)
    return new_dialog_all