import json

# Build the entity-stripped text of every sentence in labeled.json so it can
# later be compared against the entity-stripped training sentences.
removed_labeled = []
with open('labeled.json') as labeled_file:
    labeled_items = json.load(labeled_file)

for entry in labeled_items:
    # Any token beginning with OBJ or SUBJ is treated as an entity
    # placeholder and swapped for the '###' marker.
    masked_words = [
        '###' if word.startswith(('OBJ', 'SUBJ')) else word
        for word in entry['sent'].split(' ')
    ]
    # Only markers followed by a space are dropped; a marker that is the very
    # last token has no trailing space and therefore stays in the string.
    removed_labeled.append(' '.join(masked_words).replace('### ', ''))

print(len(removed_labeled))
# Build the entity-stripped text of every training example, in file order.
# Entry k of no_entity_sens corresponds to record k of train.json.
no_entity_sens = []
with open('./tacred_raw/train.json') as f:
    train_examples = json.load(f)

for example in train_examples:
    # Work on a copy so the loaded record itself is left untouched.
    tokens = list(example['token'])
    # Mask the subject span first, then the object span (both inclusive).
    for start_key, end_key in (('subj_start', 'subj_end'),
                               ('obj_start', 'obj_end')):
        for idx in range(example[start_key], example[end_key] + 1):
            tokens[idx] = '###'
    # Only markers followed by a space are dropped; a marker that is the very
    # last token has no trailing space and therefore stays in the string.
    no_entity_sens.append(' '.join(tokens).replace('### ', ''))

# Re-read the training data and produce, for every example, the sentence with
# inline entity markers plus its relation label.
target = []
rel_list = []
with open('./tacred_raw/train.json') as train_file:
    examples = json.load(train_file)

for example in examples:
    relation = example['relation']
    rel_list.append(relation)

    words = example['token']
    subj_first, subj_last = example['subj_start'], example['subj_end']
    obj_first, obj_last = example['obj_start'], example['obj_end']

    # Opening marker goes in front of the first entity token, closing marker
    # after the last one; for a single-token entity both land on the same
    # token.
    words[subj_first] = '#subj#' + words[subj_first]
    words[subj_last] += '#/subj#'
    words[obj_first] = '#obj#' + words[obj_first]
    words[obj_last] += '#/obj#'

    target.append({'sent': ' '.join(words), 'rel': relation})

# Split the marked-up training sentences into two groups: those whose
# entity-stripped text also appears among the labeled sentences, and the rest.
new_labeled = []
new_unlabeled = []

# Hash the labeled sentences once so each membership test is O(1) instead of
# a linear scan over the whole list for every training sentence.
labeled_lookup = set(removed_labeled)

# no_entity_sens and target are parallel lists (one entry per train.json
# record), so they can be walked in lockstep.
for stripped, marked in zip(no_entity_sens, target):
    if stripped in labeled_lookup:
        new_labeled.append(marked)
    else:
        new_unlabeled.append(marked)


with open('new_labeled.json', 'w') as f:
    json.dump(new_labeled, f, indent=2)

with open('new_unlabeled.json', 'w') as f:
    json.dump(new_unlabeled, f, indent=2)