Comments (4)
Hello!
Sorry for any inconvenience this has caused. We generate the historical information with the data-preprocessing function from RE-NET (Recurrent Event Network: Autoregressive Structure Inference over Temporal Knowledge Graphs). The code has not been optimized yet, and it currently needs a large amount of memory to generate the data. If you optimize the code, we would welcome a pull request to the GitHub repository.
Best wishes
from cenet.
The following code in get_history_graph.py can be refined into a large memory saver, though it still requires about 64 GB of memory.
for ix in tqdm.tqdm(range(quadruples.shape[0])):
    s_history_oid.append([])
    for con_events in s_history_event_o[ix]:
        cur_events = con_events[:, 1].tolist()
        s_history_oid[-1] += cur_events
s_history_label_true = np.zeros((quadruples.shape[0], 1))
for ix in tqdm.tqdm(range(quadruples.shape[0])):
    if oo[ix] in s_history_oid[ix]:
        s_history_label_true[ix] = 1
return s_history_label_true
can be rewritten so that each row's history list is built, deduplicated, and discarded within a single pass:
s_history_label_true = np.zeros((quadruples.shape[0], 1))
for ix in tqdm.tqdm(range(quadruples.shape[0])):
    hist_tmp = []
    for con_events in s_history_event_o[ix]:
        cur_events = con_events[:, 1].tolist()
        hist_tmp += cur_events
    hist_tmp = set(hist_tmp)
    if oo[ix] in hist_tmp:
        s_history_label_true[ix] = 1
return s_history_label_true
Many similar parts of the code can be optimized following this logic; the frequency-matrix branch, for instance, can be kept sparse from the start, as in the sketch below.
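This is only a sketch, not code from the repository: it reuses the variable names above (quadruples, rr, s_history_event_o, num_e are assumed to be in scope) and accumulates the counts directly into a scipy.sparse.lil_matrix, so the dense (num_quadruples, num_e) array in the 'related' branch is never allocated:

from scipy.sparse import lil_matrix

# Build the per-relation frequency counts sparsely instead of filling
# a dense (num_quadruples, num_e) array and converting it afterwards.
s_history_related = lil_matrix((quadruples.shape[0], num_e), dtype=np.float32)
for ix in tqdm.tqdm(range(quadruples.shape[0])):
    for con_events in s_history_event_o[ix]:
        idxx = (con_events[:, 0] == rr[ix]).nonzero()[0]
        # set() mirrors NumPy fancy indexing, which counts duplicate indices once.
        for oid in set(con_events[idxx, 1].tolist()):
            s_history_related[ix, oid] += 1
s_history_related = s_history_related.tocsc()  # same csc_matrix format as before

The lil_matrix format is efficient for incremental writes; converting to csc_matrix at the end matches what the original code returns.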
from cenet.
@HuangRiKui
Thank you. I hope this helps others; here is my code.
import numpy as np
import os
from collections import defaultdict
import pickle
import dgl
import torch
import tqdm
import gc
from scipy.sparse import csc_matrix
print('GDELT')
def load_quadruples(inPath, fileName, fileName2=None):
    # Read whitespace-separated (head, rel, tail, time) quadruples and
    # return them with the sorted list of distinct timestamps.
    with open(os.path.join(inPath, fileName), 'r') as fr:
        quadrupleList = []
        times = set()
        for line in fr:
            line_split = line.split()
            head = int(line_split[0])
            tail = int(line_split[2])
            rel = int(line_split[1])
            time = int(line_split[3])
            quadrupleList.append([head, rel, tail, time])
            times.add(time)
    if fileName2 is not None:
        with open(os.path.join(inPath, fileName2), 'r') as fr:
            for line in fr:
                line_split = line.split()
                head = int(line_split[0])
                tail = int(line_split[2])
                rel = int(line_split[1])
                time = int(line_split[3])
                quadrupleList.append([head, rel, tail, time])
                times.add(time)
    times = list(times)
    times.sort()
    return np.asarray(quadrupleList), np.asarray(times)
def get_total_number(inPath, fileName):
    # The first line of stat.txt holds the entity count and relation count.
    with open(os.path.join(inPath, fileName), 'r') as fr:
        for line in fr:
            line_split = line.split()
            return int(line_split[0]), int(line_split[1])
def get_data_with_t(data, tim):
    triples = [[quad[0], quad[1], quad[2]] for quad in data if quad[3] == tim]
    return np.array(triples)
def comp_deg_norm(g):
    # Inverse in-degree normalization; isolated nodes get norm 1.0 to avoid division by zero.
    in_deg = g.in_degrees(range(g.number_of_nodes())).float()
    in_deg[torch.nonzero(in_deg == 0).view(-1)] = 1
    norm = 1.0 / in_deg
    return norm
def get_big_graph(data, num_rels):
    # Build a DGL graph over the entities appearing in `data`, adding inverse
    # edges with relation ids shifted by num_rels (older DGLGraph API).
    src, rel, dst = data.transpose()
    uniq_v, edges = np.unique((src, dst), return_inverse=True)
    src, dst = np.reshape(edges, (2, -1))
    g = dgl.DGLGraph()
    g.add_nodes(len(uniq_v))
    src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
    rel_o = np.concatenate((rel + num_rels, rel))
    rel_s = np.concatenate((rel, rel + num_rels))
    g.add_edges(src, dst)
    norm = comp_deg_norm(g)
    g.ndata.update({'id': torch.from_numpy(uniq_v).long().view(-1, 1), 'norm': norm.view(-1, 1)})
    g.edata['type_s'] = torch.LongTensor(rel_s)
    g.edata['type_o'] = torch.LongTensor(rel_o)
    g.ids = {eid: idx for idx, eid in enumerate(uniq_v)}  # global entity id -> local node index
    return g
def get_history_target(quadruples, s_history_event_o, o_history_event_s, actor, target=None):
    # target == 'label': return the 0/1 "entity seen in history" labels.
    # target == 'related': return the per-relation frequency matrix as a csc_matrix.
    # target is None: return both. Relies on the module-level num_e (entity count).
    if target is not None:
        if target == 'label':
            if actor == 's':
                oo = quadruples[:, 2]
                s_history_label_true = np.zeros((quadruples.shape[0], 1))
                for ix in tqdm.tqdm(range(quadruples.shape[0])):
                    hist_tmp = []
                    for con_events in s_history_event_o[ix]:
                        cur_events = con_events[:, 1].tolist()
                        hist_tmp += cur_events
                    hist_tmp = set(hist_tmp)
                    if oo[ix] in hist_tmp:
                        s_history_label_true[ix] = 1
                return s_history_label_true
            else:
                ss = quadruples[:, 0]
                o_history_label_true = np.zeros((quadruples.shape[0], 1))
                for ix in tqdm.tqdm(range(quadruples.shape[0])):
                    hist_tmp = []
                    for con_events in o_history_event_s[ix]:
                        cur_events = con_events[:, 1].tolist()
                        hist_tmp += cur_events
                    hist_tmp = set(hist_tmp)
                    if ss[ix] in hist_tmp:
                        o_history_label_true[ix] = 1
                return o_history_label_true
        else:
            if actor == 's':
                rr = quadruples[:, 1]
                # np.float was removed in NumPy 1.24; the builtin float (float64) is equivalent.
                s_history_related = np.zeros((quadruples.shape[0], num_e), dtype=float)
                for ix in tqdm.tqdm(range(quadruples.shape[0])):
                    for con_events in s_history_event_o[ix]:
                        idxx = (con_events[:, 0] == rr[ix]).nonzero()[0]
                        cur_events = con_events[idxx, 1].tolist()
                        s_history_related[ix][cur_events] += 1
                s_history_related = csc_matrix(s_history_related)
                return s_history_related
            else:
                rr = quadruples[:, 1]
                o_history_related = np.zeros((quadruples.shape[0], num_e), dtype=float)
                for ix in tqdm.tqdm(range(quadruples.shape[0])):
                    for con_events in o_history_event_s[ix]:
                        idxx = (con_events[:, 0] == rr[ix]).nonzero()[0]
                        cur_events = con_events[idxx, 1].tolist()
                        o_history_related[ix][cur_events] += 1
                o_history_related = csc_matrix(o_history_related)
                return o_history_related
    else:
        if actor == 's':
            rr = quadruples[:, 1]
            oo = quadruples[:, 2]
            s_history_related = np.zeros((quadruples.shape[0], num_e), dtype=float)
            s_history_label_true = np.zeros((quadruples.shape[0], 1))
            for ix in tqdm.tqdm(range(quadruples.shape[0])):
                hist_tmp = []
                for con_events in s_history_event_o[ix]:
                    idxx = (con_events[:, 0] == rr[ix]).nonzero()[0]
                    cur_events = con_events[idxx, 1].tolist()
                    hist_tmp += con_events[:, 1].tolist()
                    s_history_related[ix][cur_events] += 1
                hist_tmp = set(hist_tmp)
                if oo[ix] in hist_tmp:
                    s_history_label_true[ix] = 1
            s_history_related = csc_matrix(s_history_related)
            return s_history_label_true, s_history_related
        else:
            ss = quadruples[:, 0]
            rr = quadruples[:, 1]
            o_history_related = np.zeros((quadruples.shape[0], num_e), dtype=float)
            o_history_label_true = np.zeros((quadruples.shape[0], 1))
            for ix in tqdm.tqdm(range(quadruples.shape[0])):
                hist_tmp = []
                for con_events in o_history_event_s[ix]:
                    idxx = (con_events[:, 0] == rr[ix]).nonzero()[0]
                    cur_events = con_events[idxx, 1].tolist()
                    hist_tmp += con_events[:, 1].tolist()
                    o_history_related[ix][cur_events] += 1
                hist_tmp = set(hist_tmp)
                if ss[ix] in hist_tmp:
                    o_history_label_true[ix] = 1
            o_history_related = csc_matrix(o_history_related)
            return o_history_label_true, o_history_related
train_data, train_times = load_quadruples('', 'train.txt')
test_data, test_times = load_quadruples('', 'test.txt')
dev_data, dev_times = load_quadruples('', 'valid.txt')
# total_data, _ = load_quadruples('', 'train.txt', 'test.txt')
num_e, num_r = get_total_number('', 'stat.txt')
# Per-entity rolling history: lists of [rel, neighbor] snapshot arrays and
# the timestamp of each snapshot.
s_his = [[] for _ in range(num_e)]
o_his = [[] for _ in range(num_e)]
s_his_t = [[] for _ in range(num_e)]
o_his_t = [[] for _ in range(num_e)]
s_history_data = [[] for _ in range(len(train_data))]
o_history_data = [[] for _ in range(len(train_data))]
s_history_data_t = [[] for _ in range(len(train_data))]
o_history_data_t = [[] for _ in range(len(train_data))]
latest_t = 0
# Caches collect the events of the current timestamp until a new one arrives.
s_his_cache = [[] for _ in range(num_e)]
o_his_cache = [[] for _ in range(num_e)]
s_his_cache_t = [None for _ in range(num_e)]
o_his_cache_t = [None for _ in range(num_e)]
for i, train in enumerate(train_data):
    if i % 10000 == 0:
        print("train", i, len(train_data))
    t = train[3]
    if latest_t != t:
        # New timestamp: flush every entity's cache into its history.
        for ee in range(num_e):
            if len(s_his_cache[ee]) != 0:
                s_his[ee].append(s_his_cache[ee].copy())
                s_his_t[ee].append(s_his_cache_t[ee])
                s_his_cache[ee] = []
                s_his_cache_t[ee] = None
            if len(o_his_cache[ee]) != 0:
                o_his[ee].append(o_his_cache[ee].copy())
                o_his_t[ee].append(o_his_cache_t[ee])
                o_his_cache[ee] = []
                o_his_cache_t[ee] = None
        latest_t = t
    s = train[0]
    r = train[1]
    o = train[2]
    # Record each entity's history *before* adding the current event.
    s_history_data[i] = s_his[s].copy()
    o_history_data[i] = o_his[o].copy()
    s_history_data_t[i] = s_his_t[s].copy()
    o_history_data_t[i] = o_his_t[o].copy()
    if len(s_his_cache[s]) == 0:
        s_his_cache[s] = np.array([[r, o]])
    else:
        s_his_cache[s] = np.concatenate((s_his_cache[s], [[r, o]]), axis=0)
    s_his_cache_t[s] = t
    if len(o_his_cache[o]) == 0:
        o_his_cache[o] = np.array([[r, s]])
    else:
        o_his_cache[o] = np.concatenate((o_his_cache[o], [[r, s]]), axis=0)
    o_his_cache_t[o] = t
with open('train_history_sub.txt', 'wb') as fp:
    pickle.dump([s_history_data, s_history_data_t], fp)
with open('train_history_ob.txt', 'wb') as fp:
    pickle.dump([o_history_data, o_history_data_t], fp)
s_label_train = get_history_target(train_data, s_history_data, o_history_data, 's', 'label')
with open('train_s_label.txt', 'wb') as fp:
    pickle.dump(s_label_train, fp)
del s_label_train
gc.collect()
s_history_related_train = get_history_target(train_data, s_history_data, o_history_data, 's', 'related')
torch.save(s_history_related_train, 'train_s_frequency.txt')
del s_history_related_train
gc.collect()
o_label_train = get_history_target(train_data, s_history_data, o_history_data, 'o', 'label')
with open('train_o_label.txt', 'wb') as fp:
    pickle.dump(o_label_train, fp)
del o_label_train
gc.collect()
o_history_related_train = get_history_target(train_data, s_history_data, o_history_data, 'o', 'related')
torch.save(o_history_related_train, 'train_o_frequency.txt')
del o_history_related_train
del train_data
del s_history_data
del o_history_data
gc.collect()
s_history_data_dev = [[] for _ in range(len(dev_data))]
o_history_data_dev = [[] for _ in range(len(dev_data))]
s_history_data_dev_t = [[] for _ in range(len(dev_data))]
o_history_data_dev_t = [[] for _ in range(len(dev_data))]
for i, dev in enumerate(dev_data):
    if i % 10000 == 0:
        print("valid", i, len(dev_data))
    t = dev[3]
    if latest_t != t:
        for ee in range(num_e):
            if len(s_his_cache[ee]) != 0:
                s_his_t[ee].append(s_his_cache_t[ee])
                s_his[ee].append(s_his_cache[ee].copy())
                s_his_cache[ee] = []
                s_his_cache_t[ee] = None
            if len(o_his_cache[ee]) != 0:
                o_his_t[ee].append(o_his_cache_t[ee])
                o_his[ee].append(o_his_cache[ee].copy())
                o_his_cache[ee] = []
                o_his_cache_t[ee] = None
        latest_t = t
    s = dev[0]
    r = dev[1]
    o = dev[2]
    s_history_data_dev[i] = s_his[s].copy()
    o_history_data_dev[i] = o_his[o].copy()
    s_history_data_dev_t[i] = s_his_t[s].copy()
    o_history_data_dev_t[i] = o_his_t[o].copy()
    if len(s_his_cache[s]) == 0:
        s_his_cache[s] = np.array([[r, o]])
    else:
        s_his_cache[s] = np.concatenate((s_his_cache[s], [[r, o]]), axis=0)
    s_his_cache_t[s] = t
    if len(o_his_cache[o]) == 0:
        o_his_cache[o] = np.array([[r, s]])
    else:
        o_his_cache[o] = np.concatenate((o_his_cache[o], [[r, s]]), axis=0)
    o_his_cache_t[o] = t
with open('dev_history_sub.txt', 'wb') as fp:
    pickle.dump([s_history_data_dev, s_history_data_dev_t], fp)
with open('dev_history_ob.txt', 'wb') as fp:
    pickle.dump([o_history_data_dev, o_history_data_dev_t], fp)
s_label_dev, s_history_related_dev = get_history_target(dev_data, s_history_data_dev, o_history_data_dev, 's')
with open('dev_s_label.txt', 'wb') as fp:
    pickle.dump(s_label_dev, fp)
with open('dev_s_frequency.txt', 'wb') as fp:
    pickle.dump(s_history_related_dev, fp)
del s_label_dev
del s_history_related_dev
gc.collect()
o_label_dev, o_history_related_dev = get_history_target(dev_data, s_history_data_dev, o_history_data_dev, 'o')
with open('dev_o_label.txt', 'wb') as fp:
    pickle.dump(o_label_dev, fp)
with open('dev_o_frequency.txt', 'wb') as fp:
    pickle.dump(o_history_related_dev, fp)
del o_label_dev
del o_history_related_dev
gc.collect()
s_history_data_test = [[] for _ in range(len(test_data))]
o_history_data_test = [[] for _ in range(len(test_data))]
s_history_data_test_t = [[] for _ in range(len(test_data))]
o_history_data_test_t = [[] for _ in range(len(test_data))]
for i, test in enumerate(test_data):
    if i % 10000 == 0:
        print("test", i, len(test_data))
    t = test[3]
    if latest_t != t:
        for ee in range(num_e):
            if len(s_his_cache[ee]) != 0:
                s_his_t[ee].append(s_his_cache_t[ee])
                s_his[ee].append(s_his_cache[ee].copy())
                s_his_cache[ee] = []
                s_his_cache_t[ee] = None
            if len(o_his_cache[ee]) != 0:
                o_his_t[ee].append(o_his_cache_t[ee])
                o_his[ee].append(o_his_cache[ee].copy())
                o_his_cache[ee] = []
                o_his_cache_t[ee] = None
        latest_t = t
    s = test[0]
    r = test[1]
    o = test[2]
    s_history_data_test[i] = s_his[s].copy()
    o_history_data_test[i] = o_his[o].copy()
    s_history_data_test_t[i] = s_his_t[s].copy()
    o_history_data_test_t[i] = o_his_t[o].copy()
    # Unlike train/valid, test events are not appended to the caches;
    # only the cache timestamps are advanced.
    s_his_cache_t[s] = t
    o_his_cache_t[o] = t
with open('test_history_sub.txt', 'wb') as fp:
    pickle.dump([s_history_data_test, s_history_data_test_t], fp)
with open('test_history_ob.txt', 'wb') as fp:
    pickle.dump([o_history_data_test, o_history_data_test_t], fp)
s_label_test, s_history_related_test = get_history_target(test_data, s_history_data_test, o_history_data_test, 's')
with open('test_s_label.txt', 'wb') as fp:
    pickle.dump(s_label_test, fp)
with open('test_s_frequency.txt', 'wb') as fp:
    pickle.dump(s_history_related_test, fp)
del s_label_test
del s_history_related_test
gc.collect()
o_label_test, o_history_related_test = get_history_target(test_data, s_history_data_test, o_history_data_test, 'o')
with open('test_o_label.txt', 'wb') as fp:
    pickle.dump(o_label_test, fp)
with open('test_o_frequency.txt', 'wb') as fp:
    pickle.dump(o_history_related_test, fp)
del o_label_test
del o_history_related_test
gc.collect()
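For completeness, a minimal sketch of how the generated files can be read back (the filenames match the dumps above; note that the train frequency matrices were written with torch.save, while everything else uses pickle):

import pickle
import torch

with open('train_history_sub.txt', 'rb') as fp:
    s_history_data, s_history_data_t = pickle.load(fp)
with open('train_s_label.txt', 'rb') as fp:
    s_label_train = pickle.load(fp)
# train_s_frequency.txt / train_o_frequency.txt were saved via torch.save.
s_history_related_train = torch.load('train_s_frequency.txt')
# The dev/test labels and frequencies were all written with pickle.
with open('test_s_frequency.txt', 'rb') as fp:
    s_history_related_test = pickle.load(fp)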
from cenet.