import numpy as np

# Convert the Pubmed-Diabetes node table into a dense numeric matrix and save
# it as 'pubmed.content'.
#
# Output layout, one row per data line of the input:
#   column 0      -> node id (first token of the line)
#   columns 1..-2 -> feature values, placed by the position of the feature
#                    name in the header line
#   column -1     -> integer taken from the second token (after its '=')
#
# The matrix size is fixed up front; rows/columns that are never written stay
# at zero, and an input with more data lines than rows raises IndexError.
features = np.zeros((19717, 502))
fields = []
# Feature name -> column in `features`. A dict avoids rescanning `fields`
# for every single feature value on every data line.
field_column = {}
with open('data/pubmed/Pubmed-Diabetes.NODE.paper.tab', 'r') as f:
    for num, line in enumerate(f):
        if num == 0:
            # The first line of the file is not parsed.
            continue

        entries = line.split()

        if num == 1:
            # Header line: the first and last tokens are skipped; every other
            # token is split on ':' and its second part is the feature name.
            for thing in entries[1:-1]:
                name = thing.split(':')[1]
                fields.append(name)
                # Column 0 holds the node id, so features start at column 1.
                # setdefault keeps the first occurrence of a repeated name,
                # matching what list.index() would return.
                field_column.setdefault(name, len(fields))
            continue

        # Data line: row index is the line number minus the two leading lines.
        row = num - 2
        features[row, 0] = int(entries[0])
        # Tokens between the second and the last are '<name>=<value>' pairs;
        # the last token is ignored. An unknown name raises KeyError.
        for thing in entries[2:-1]:
            name, value = thing.split('=')[:2]
            features[row, field_column[name]] = float(value)
        # Second token is '<key>=<int>'; only the integer is stored.
        features[row, -1] = int(entries[1].split('=')[1])

# First column and last column are written as integers, the rest as floats.
np.savetxt('pubmed.content', features, fmt = '%d ' + '%f '*(features.shape[1] - 2) + '%d')


# Convert the Pubmed-Diabetes citation table into a two-column integer edge
# list and save it as 'pubmed.cites'.
#
# The first two lines of the input are skipped. On every following line the
# second and fourth whitespace-separated tokens are split on ':' and the part
# after the colon is parsed as an integer node id.
edges = []
with open('data/pubmed/Pubmed-Diabetes.DIRECTED.cites.tab', 'r') as f:
    for num, line in enumerate(f):
        if num < 2:
            continue
        entries = line.split()
        node1 = int(entries[1].split(':')[1])
        node2 = int(entries[3].split(':')[1])
        edges.append((node1, node2))

# `np.int` was only an alias for the builtin `int` and has been removed from
# NumPy (deprecated in 1.20, gone in 1.24); the builtin gives the same dtype.
edges = np.array(edges, dtype=int)
np.savetxt('pubmed.cites', edges, fmt = '%d')