<a target="_blank" rel="noopener noreferrer nofollow" href="https://user-images.github

thanks a lot. wish you a happy life <span class="email-hidden-togg

<a class="user-mention notranslate" data-hovercard-type="user" data-hovercard-url="/us

It seems that it is not possible to handle the dataset "GDELT"!!!!!!!! about cenet HOT 4 CLOSED

xyjigsaw commented on September 26, 2024

It seems that it is not possible to handle the dataset "GDELT"!!!!!!!!

from cenet.

Comments (4)

xyjigsaw commented on September 26, 2024

Hello!
Sorry for any inconvenience this has caused. We generate the historical information with the data preprocessing function from RE-NET (Recurrent Event Network: Autoregressive Structure Inference over Temporal Knowledge Graphs). That code has not been optimized yet, so generating the data does require a large amount of memory. If you have optimized the code, we would be glad if you shared it with us, for example by pushing it to the GitHub repository.
Best wishes

from cenet.

HuangRiKui commented on September 26, 2024

The first snippet below, from get_history_graph.py, can be replaced with the second snippet. This saves a lot of memory, although the script still requires about 64 GB.

for ix in tqdm.tqdm(range(quadruples.shape[0])):
    s_history_oid.append([])
    for con_events in s_history_event_o[ix]:
        cur_events = con_events[:, 1].tolist()
        s_history_oid[-1] += cur_events
s_history_label_true = np.zeros((quadruples.shape[0], 1))

for ix in tqdm.tqdm(range(quadruples.shape[0])):
    if oo[ix] in s_history_oid[ix]:
        s_history_label_true[ix] = 1
return s_history_label_true

s_history_label_true = np.zeros((quadruples.shape[0], 1))
for ix in tqdm.tqdm(range(quadruples.shape[0])):
    hist_tmp = []
    for con_events in s_history_event_o[ix]:
        cur_events = con_events[:, 1].tolist()
        hist_tmp += cur_events
    hist_tmp = set(hist_tmp)
    if oo[ix] in hist_tmp:
        s_history_label_true[ix] = 1
return s_history_label_true

There are many similar parts of the code that can be optimized following this logic.

from cenet.

binchen4110 commented on September 26, 2024

thanks a lot. wish you a happy life

…

------------------ 原始邮件 ------------------ 发件人: ***@***.***>; 发送时间: 2023年6月27日(星期二) 晚上8:39 收件人: ***@***.***>; 抄送: ***@***.***>; ***@***.***>; 主题: Re: [xyjigsaw/CENET] It seems that it not impossible to handle the datasate "GDELT"!!!!!!!! (Issue #6) The following code of get_history_graph.py can be refined and will be a huge memory saver, but still requires about 64GB of memory. for ix in tqdm.tqdm(range(quadruples.shape[0])): s_history_oid.append([]) for con_events in s_history_event_o[ix]: cur_events = con_events[:, 1].tolist() s_history_oid[-1] += cur_events s_history_label_true = np.zeros((quadruples.shape[0], 1)) for ix in tqdm.tqdm(range(quadruples.shape[0])): if oo[ix] in s_history_oid[ix]: s_history_label_true[ix] = 1 return s_history_label_true to s_history_label_true = np.zeros((quadruples.shape[0], 1)) for ix in tqdm.tqdm(range(quadruples.shape[0])): hist_tmp = [] for con_events in s_history_event_o[ix]: cur_events = con_events[:, 1].tolist() hist_tmp += cur_events hist_tmp = set(hist_tmp) if oo[ix] in hist_tmp: s_history_label_true[ix] = 1 return s_history_label_true There are many similar parts of the code that can be optimized following this logic. — Reply to this email directly, view it on GitHub, or unsubscribe. You are receiving this because you authored the thread.Message ID: ***@***.***>

from cenet.

roadwide commented on September 26, 2024

@HuangRiKui
Thank you. Here is my code; I hope it is helpful to others.

import numpy as np
import os
from collections import defaultdict
import pickle
import dgl
import torch
import tqdm
import gc
from scipy.sparse import csc_matrix

# Banner printed once when the script starts.
print('GDELT')


def load_quadruples(inPath, fileName, fileName2=None):
    """Read (head, relation, tail, time) quadruples from one or two text files.

    Every non-blank line must contain at least four whitespace-separated
    integers in the order ``head relation tail time``; additional fields on
    a line are ignored. Blank lines (for example a trailing newline at the
    end of the file) are skipped instead of raising ``IndexError``.

    NOTE: this file defines ``load_quadruples`` again further down; once the
    module has been executed, the later definition is the one bound to the
    name.

    Args:
        inPath: Directory containing the files.
        fileName: Name of the first file to read.
        fileName2: Optional second file whose rows are appended after the
            rows of ``fileName``.

    Returns:
        A tuple ``(quadruples, times)`` where ``quadruples`` is an array with
        one ``[head, rel, tail, time]`` row per input line (file order) and
        ``times`` is the sorted array of distinct timestamps.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
        ValueError: If one of the first four fields is not an integer.
    """
    file_names = [fileName] if fileName2 is None else [fileName, fileName2]

    quadruple_list = []
    times = set()
    for name in file_names:
        with open(os.path.join(inPath, name), 'r') as fr:
            for line in fr:
                fields = line.split()
                if not fields:
                    # Blank line: nothing to parse.
                    continue
                head = int(fields[0])
                rel = int(fields[1])
                tail = int(fields[2])
                stamp = int(fields[3])
                quadruple_list.append([head, rel, tail, stamp])
                times.add(stamp)

    return np.asarray(quadruple_list), np.asarray(sorted(times))


def get_total_number(inPath, fileName):
    """Return the first two integers found on the first line of a file.

    Args:
        inPath: Directory containing the file.
        fileName: Name of the file to read.

    Returns:
        ``(int, int)`` parsed from the first two whitespace-separated fields
        of the first line, or ``None`` when the file is empty.
    """
    stat_path = os.path.join(inPath, fileName)
    with open(stat_path, 'r') as fr:
        first_line = fr.readline()

    # readline() yields '' only at end of file, i.e. for an empty file.
    if first_line == '':
        return None

    fields = first_line.split()
    return int(fields[0]), int(fields[1])


def load_quadruples(inPath, fileName, fileName2=None):
    """Read (head, relation, tail, time) quadruples from one or two text files.

    Every non-blank line must contain at least four whitespace-separated
    integers in the order ``head relation tail time``; additional fields on
    a line are ignored. Blank lines (for example a trailing newline at the
    end of the file) are skipped instead of raising ``IndexError``.

    NOTE: an earlier definition with the same name exists above in this
    file; this later one is the definition that stays bound to the name.

    Args:
        inPath: Directory containing the files.
        fileName: Name of the first file to read.
        fileName2: Optional second file whose rows are appended after the
            rows of ``fileName``.

    Returns:
        A tuple ``(quadruples, times)`` where ``quadruples`` is an array with
        one ``[head, rel, tail, time]`` row per input line (file order) and
        ``times`` is the sorted array of distinct timestamps.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
        ValueError: If one of the first four fields is not an integer.
    """
    file_names = [fileName] if fileName2 is None else [fileName, fileName2]

    quadruple_list = []
    times = set()
    for name in file_names:
        with open(os.path.join(inPath, name), 'r') as fr:
            for line in fr:
                fields = line.split()
                if not fields:
                    # Blank line: nothing to parse.
                    continue
                head = int(fields[0])
                rel = int(fields[1])
                tail = int(fields[2])
                stamp = int(fields[3])
                quadruple_list.append([head, rel, tail, stamp])
                times.add(stamp)

    return np.array(quadruple_list), np.asarray(sorted(times))


def get_data_with_t(data, tim):
    """Return the (head, rel, tail) triples of the rows whose 4th field equals ``tim``.

    Args:
        data: Iterable of rows indexable at positions 0..3.
        tim: Timestamp value to select.

    Returns:
        ``np.array`` of the selected ``[row[0], row[1], row[2]]`` triples in
        input order (an empty 1-D array when no row matches).
    """
    selected = []
    for quad in data:
        if quad[3] == tim:
            selected.append([quad[0], quad[1], quad[2]])
    return np.array(selected)


def comp_deg_norm(g):
    """Return ``1 / in-degree`` for every node of ``g`` as a float tensor.

    Nodes whose in-degree is 0 are treated as having in-degree 1, so their
    norm is 1.0 rather than infinity.

    Args:
        g: Graph object providing ``number_of_nodes()`` and
            ``in_degrees(nodes)`` (the latter returning a tensor).
    """
    node_range = range(g.number_of_nodes())
    degrees = g.in_degrees(node_range).float()
    # Avoid division by zero for nodes without incoming edges.
    degrees[degrees == 0] = 1
    return 1.0 / degrees


def get_big_graph(data, num_rels):
    """Build a DGL graph from an array of (src, rel, dst) triples.

    Node ids are relabelled to ``0..len(unique ids) - 1``. Every triple is
    added as two edges: ``src -> dst`` and ``dst -> src``.

    Args:
        data: Array whose transpose unpacks into ``src, rel, dst``.
        num_rels: Offset added to ``rel`` for one of the two edge directions
            (presumably the number of relation types — TODO confirm).

    Returns:
        The graph, with node data ``'id'`` (original node id) and ``'norm'``
        (see ``comp_deg_norm``), edge data ``'type_s'`` / ``'type_o'``, and an
        ``ids`` dict mapping each original node id to its relabelled index.
    """
    heads, rels, tails = data.transpose()
    uniq_v, relabelled = np.unique((heads, tails), return_inverse=True)
    heads, tails = np.reshape(relabelled, (2, -1))

    g = dgl.DGLGraph()
    g.add_nodes(len(uniq_v))

    # First half of the edges is src -> dst, second half is dst -> src.
    g.add_edges(np.concatenate((heads, tails)), np.concatenate((tails, heads)))

    shifted_rels = rels + num_rels
    norm = comp_deg_norm(g)
    g.ndata.update({'id': torch.from_numpy(uniq_v).long().view(-1, 1), 'norm': norm.view(-1, 1)})
    g.edata['type_s'] = torch.LongTensor(np.concatenate((rels, shifted_rels)))
    g.edata['type_o'] = torch.LongTensor(np.concatenate((shifted_rels, rels)))

    # Original node id -> position in the relabelled node list.
    g.ids = {node_id: position for position, node_id in enumerate(uniq_v)}
    return g

def _seen_in_history(history, entity):
    """Return True if ``entity`` occurs in column 1 of any event array in ``history``."""
    return any((con_events[:, 1] == entity).any() for con_events in history)


def _related_counts(history, relation):
    """Count, per entity, in how many history steps it occurs with ``relation``.

    Within a single step an entity is counted once even if it appears in
    several rows; this mirrors the earlier dense ``row[indices] += 1`` update,
    where repeated indices only incremented once.

    Returns:
        ``(entities, counts)`` sorted by entity id; both empty when no row of
        the history matches ``relation``.
    """
    per_step = [np.unique(con_events[con_events[:, 0] == relation, 1])
                for con_events in history]
    if not per_step:
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64)
    return np.unique(np.concatenate(per_step), return_counts=True)


def get_history_target(quadruples, s_history_event_o, o_history_event_s, actor, target=None,
                       num_entities=None):
    """Compute history labels and/or history frequency vectors for each quadruple.

    For row ``ix`` the history is ``s_history_event_o[ix]`` when ``actor`` is
    ``'s'`` and ``o_history_event_s[ix]`` otherwise. Each history is a sequence
    of 2-D arrays whose column 0 is compared with the row's relation and whose
    column 1 holds entity ids.

    * label: 1 if the row's answer entity (column 2 of ``quadruples`` for
      ``actor == 's'``, column 0 otherwise) occurs anywhere in column 1 of the
      history, else 0.
    * related: for every entity, the number of history steps that contain a
      row with the same relation as the current row and that entity.

    The frequency matrix is assembled directly in sparse form, so no dense
    ``rows x num_entities`` array is allocated.

    Args:
        quadruples: 2-D array with columns ``subject, relation, object, ...``.
        s_history_event_o: Per-row history used when ``actor == 's'``.
        o_history_event_s: Per-row history used for any other ``actor``.
        actor: ``'s'`` for the subject side; anything else selects the object side.
        target: ``'label'`` returns only the labels, any other non-None value
            returns only the frequency matrix, ``None`` returns both.
        num_entities: Number of columns of the frequency matrix. Defaults to
            the module-level ``num_e`` (looked up at call time).

    Returns:
        ``labels`` (float array of shape ``(rows, 1)``), ``related``
        (``csc_matrix`` of float64, shape ``(rows, num_entities)``), or the
        tuple ``(labels, related)`` depending on ``target``.

    Raises:
        ValueError: From scipy if an entity id is negative or not smaller
            than ``num_entities``.
    """
    if actor == 's':
        histories, answers = s_history_event_o, quadruples[:, 2]
    else:
        histories, answers = o_history_event_s, quadruples[:, 0]
    relations = quadruples[:, 1]
    num_rows = quadruples.shape[0]

    want_label = target is None or target == 'label'
    want_related = target != 'label'

    if want_related and num_entities is None:
        # Backward-compatible default: the script defines num_e at module level.
        num_entities = num_e

    labels = np.zeros((num_rows, 1)) if want_label else None
    row_chunks, col_chunks, count_chunks = [], [], []

    for ix in tqdm.tqdm(range(num_rows)):
        history = histories[ix]
        if want_label and _seen_in_history(history, answers[ix]):
            labels[ix] = 1
        if want_related:
            entities, counts = _related_counts(history, relations[ix])
            if entities.size:
                row_chunks.append(np.full(entities.size, ix, dtype=np.int64))
                col_chunks.append(entities)
                count_chunks.append(counts)

    if not want_related:
        return labels

    if row_chunks:
        rows = np.concatenate(row_chunks)
        cols = np.concatenate(col_chunks)
        data = np.concatenate(count_chunks).astype(np.float64)
    else:
        rows = np.empty(0, dtype=np.int64)
        cols = np.empty(0, dtype=np.int64)
        data = np.empty(0, dtype=np.float64)

    # np.float64 replaces the removed np.float alias (same dtype as before).
    related = csc_matrix((data, (rows, cols)), shape=(num_rows, num_entities), dtype=np.float64)

    if want_label:
        return labels, related
    return related

# Load the three splits from the current directory. Each call returns the
# quadruple array and the sorted distinct timestamps of that split.
train_data, train_times = load_quadruples('', 'train.txt')
test_data, test_times = load_quadruples('', 'test.txt')
dev_data, dev_times = load_quadruples('', 'valid.txt')
# total_data, _ = load_quadruples('', 'train.txt', 'test.txt')

# First two integers of the first line of stat.txt; num_e sizes the per-entity
# lists below (presumably entity and relation counts — TODO confirm).
num_e, num_r = get_total_number('', 'stat.txt')

# Per-entity history: one event array per flushed timestamp in which the
# entity appeared as subject (s_his) or object (o_his); *_t hold the
# timestamps of those arrays.
s_his = [[] for _ in range(num_e)]
o_his = [[] for _ in range(num_e)]
s_his_t = [[] for _ in range(num_e)]
o_his_t = [[] for _ in range(num_e)]
# Per-training-row snapshots of the history of that row's subject / object.
s_history_data = [[] for _ in range(len(train_data))]
o_history_data = [[] for _ in range(len(train_data))]
s_history_data_t = [[] for _ in range(len(train_data))]
o_history_data_t = [[] for _ in range(len(train_data))]
# NOTE(review): `e` is not read anywhere in the visible script and `r` is
# overwritten inside the loops below.
e = []
r = []
# Timestamp of the most recently processed row; a change triggers a cache flush.
latest_t = 0
# Events of the timestamp currently being processed, per entity, not yet
# moved into s_his / o_his; *_cache_t hold that timestamp.
s_his_cache = [[] for _ in range(num_e)]
o_his_cache = [[] for _ in range(num_e)]
s_his_cache_t = [None for _ in range(num_e)]
o_his_cache_t = [None for _ in range(num_e)]


# Walk the training rows in file order and record, for every row, the history
# of its subject and object as known before the row's own timestamp.
# NOTE(review): the flush logic assumes rows are grouped by timestamp — confirm.
for i, train in enumerate(train_data):
    if i % 10000 == 0:
        print("train", i, len(train_data))
    # if i == 10000:
    #     break
    t = train[3]
    if latest_t != t:
        # New timestamp: move every non-empty cache into the permanent
        # per-entity history and reset the cache.

        for ee in range(num_e):
            if len(s_his_cache[ee]) != 0:

                s_his[ee].append(s_his_cache[ee].copy())
                s_his_t[ee].append(s_his_cache_t[ee])
                s_his_cache[ee] = []
                s_his_cache_t[ee] = None
            if len(o_his_cache[ee]) != 0:

                o_his[ee].append(o_his_cache[ee].copy())
                o_his_t[ee].append(o_his_cache_t[ee])
                o_his_cache[ee] = []
                o_his_cache_t[ee] = None
        latest_t = t
    s = train[0]
    r = train[1]
    o = train[2]
    # print(s_his[r][s])
    # Shallow copies: the snapshot lists are independent, the event arrays
    # inside them are shared with s_his / o_his.
    s_history_data[i] = s_his[s].copy()
    o_history_data[i] = o_his[o].copy()
    s_history_data_t[i] = s_his_t[s].copy()
    o_history_data_t[i] = o_his_t[o].copy()
    # print(o_history_data_g[i])

    # Add this row's (relation, object) pair to the subject's cache for the
    # current timestamp.
    if len(s_his_cache[s]) == 0:
        s_his_cache[s] = np.array([[r, o]])
    else:
        s_his_cache[s] = np.concatenate((s_his_cache[s], [[r, o]]), axis=0)
    s_his_cache_t[s] = t

    # Add this row's (relation, subject) pair to the object's cache.
    if len(o_his_cache[o]) == 0:
        o_his_cache[o] = np.array([[r, s]])
    else:
        o_his_cache[o] = np.concatenate((o_his_cache[o], [[r, s]]), axis=0)
    o_his_cache_t[o] = t

    # print(s_history_data[i], s_history_data_g[i])
    # with open('ttt.txt', 'wb') as fp:
    #     pickle.dump(s_history_data_g, fp)
    # print("save")

# Persist the per-row training histories together with their timestamps.
with open('train_history_sub.txt', 'wb') as fp:
    pickle.dump([s_history_data, s_history_data_t], fp)
with open('train_history_ob.txt', 'wb') as fp:
    pickle.dump([o_history_data, o_history_data_t], fp)

# Labels and frequency matrices are computed one at a time and released
# (del + gc.collect) before the next one is built.
s_label_train = get_history_target(train_data, s_history_data, o_history_data, 's', 'label')
with open('train_s_label.txt', 'wb') as fp:
    pickle.dump(s_label_train, fp)
del s_label_train
gc.collect()

# NOTE(review): the training frequency matrices are written with torch.save,
# whereas the valid/test frequency files later in this script are written
# with pickle.dump; whatever reads these files must use the matching loader
# — confirm this difference is intended.
s_history_related_train = get_history_target(train_data, s_history_data, o_history_data, 's', 'related')
torch.save(s_history_related_train, 'train_s_frequency.txt')
del s_history_related_train
gc.collect()

o_label_train = get_history_target(train_data, s_history_data, o_history_data, 'o', 'label')
with open('train_o_label.txt', 'wb') as fp:
    pickle.dump(o_label_train, fp)
del o_label_train
gc.collect()

o_history_related_train = get_history_target(train_data, s_history_data, o_history_data, 'o', 'related')
torch.save(o_history_related_train, 'train_o_frequency.txt')
del o_history_related_train
# The training arrays are no longer needed below. s_history_data_t and
# o_history_data_t are not deleted here and stay referenced.
del train_data
del s_history_data
del o_history_data
gc.collect()


# print(s_history_data[0])
# Per-row history snapshots (and their timestamps) for the validation split.
s_history_data_dev = [[] for _ in range(len(dev_data))]
o_history_data_dev = [[] for _ in range(len(dev_data))]
s_history_data_dev_t = [[] for _ in range(len(dev_data))]
o_history_data_dev_t = [[] for _ in range(len(dev_data))]

# Same procedure as for the training rows; it continues from the history,
# caches and latest_t left behind by the training loop.
for i, dev in enumerate(dev_data):
    if i % 10000 == 0:
        print("valid", i, len(dev_data))
    t = dev[3]
    if latest_t != t:
        # New timestamp: flush non-empty caches into the permanent history.
        for ee in range(num_e):
            if len(s_his_cache[ee]) != 0:
                s_his_t[ee].append(s_his_cache_t[ee])
                s_his[ee].append(s_his_cache[ee].copy())
                s_his_cache[ee] = []
                s_his_cache_t[ee] = None
            if len(o_his_cache[ee]) != 0:

                o_his_t[ee].append(o_his_cache_t[ee])
                o_his[ee].append(o_his_cache[ee].copy())

                o_his_cache[ee] = []
                o_his_cache_t[ee] = None
        latest_t = t
    s = dev[0]
    r = dev[1]
    o = dev[2]
    # Snapshot the history known before this row's timestamp.
    s_history_data_dev[i] = s_his[s].copy()
    o_history_data_dev[i] = o_his[o].copy()
    s_history_data_dev_t[i] = s_his_t[s].copy()
    o_history_data_dev_t[i] = o_his_t[o].copy()
    # Validation rows are added to the caches, so they become history for
    # later timestamps.
    if len(s_his_cache[s]) == 0:
        s_his_cache[s] = np.array([[r, o]])
    else:
        s_his_cache[s] = np.concatenate((s_his_cache[s], [[r, o]]), axis=0)
    s_his_cache_t[s] = t

    if len(o_his_cache[o]) == 0:
        o_his_cache[o] = np.array([[r, s]])
    else:
        o_his_cache[o] = np.concatenate((o_his_cache[o], [[r, s]]), axis=0)
    o_his_cache_t[o] = t

    # print(o_his_cache[o])

# Persist the per-row validation histories together with their timestamps.
with open('dev_history_sub.txt', 'wb') as fp:
    pickle.dump([s_history_data_dev, s_history_data_dev_t], fp)
with open('dev_history_ob.txt', 'wb') as fp:
    pickle.dump([o_history_data_dev, o_history_data_dev_t], fp)

# Without a `target` argument get_history_target returns (labels, frequency
# matrix) in one pass; both are pickled and then released.
s_label_dev, s_history_related_dev = get_history_target(dev_data, s_history_data_dev, o_history_data_dev, 's')
with open('dev_s_label.txt', 'wb') as fp:
    pickle.dump(s_label_dev, fp)
with open('dev_s_frequency.txt', 'wb') as fp:
    pickle.dump(s_history_related_dev, fp)
del s_label_dev
del s_history_related_dev
gc.collect()


# Same for the object side.
o_label_dev, o_history_related_dev = get_history_target(dev_data, s_history_data_dev, o_history_data_dev, 'o')
with open('dev_o_label.txt', 'wb') as fp:
    pickle.dump(o_label_dev, fp)
with open('dev_o_frequency.txt', 'wb') as fp:
    pickle.dump(o_history_related_dev, fp)
del o_label_dev
del o_history_related_dev
gc.collect()

# Per-row history snapshots (and their timestamps) for the test split.
s_history_data_test = [[] for _ in range(len(test_data))]
o_history_data_test = [[] for _ in range(len(test_data))]

s_history_data_test_t = [[] for _ in range(len(test_data))]
o_history_data_test_t = [[] for _ in range(len(test_data))]

# Continues from the history, caches and latest_t left behind by the
# preceding loops.
for i, test in enumerate(test_data):
    if i % 10000 == 0:
        print("test", i, len(test_data))
    t = test[3]
    if latest_t != t:
        # New timestamp: flush non-empty caches into the permanent history.
        for ee in range(num_e):
            if len(s_his_cache[ee]) != 0:
                s_his_t[ee].append(s_his_cache_t[ee])
                s_his[ee].append(s_his_cache[ee].copy())
                s_his_cache[ee] = []
                s_his_cache_t[ee] = None
            if len(o_his_cache[ee]) != 0:
                o_his_t[ee].append(o_his_cache_t[ee])
                o_his[ee].append(o_his_cache[ee].copy())
                o_his_cache[ee] = []
                o_his_cache_t[ee] = None
        latest_t = t
    s = test[0]
    r = test[1]
    o = test[2]
    # Snapshot the history known before this row's timestamp.
    s_history_data_test[i] = s_his[s].copy()
    o_history_data_test[i] = o_his[o].copy()
    s_history_data_test_t[i] = s_his_t[s].copy()
    o_history_data_test_t[i] = o_his_t[o].copy()

    # Unlike the train/valid loops, a test row's (r, o) / (r, s) pair is NOT
    # appended to the event caches, so test rows never become history for
    # later rows. Only the cache timestamp slots are written, as before.
    s_his_cache_t[s] = t
    o_his_cache_t[o] = t


# Persist the per-row test histories together with their timestamps.
with open('test_history_sub.txt', 'wb') as fp:
    pickle.dump([s_history_data_test, s_history_data_test_t], fp)
with open('test_history_ob.txt', 'wb') as fp:
    pickle.dump([o_history_data_test, o_history_data_test_t], fp)

# Without a `target` argument get_history_target returns (labels, frequency
# matrix) in one pass; both are pickled and then released.
s_label_test, s_history_related_test = get_history_target(test_data, s_history_data_test, o_history_data_test, 's')
with open('test_s_label.txt', 'wb') as fp:
    pickle.dump(s_label_test, fp)
with open('test_s_frequency.txt', 'wb') as fp:
    pickle.dump(s_history_related_test, fp)
del s_label_test
del s_history_related_test
gc.collect()


# Same for the object side.
o_label_test, o_history_related_test = get_history_target(test_data, s_history_data_test, o_history_data_test, 'o')
with open('test_o_label.txt', 'wb') as fp:
    pickle.dump(o_label_test, fp)
with open('test_o_frequency.txt', 'wb') as fp:
    pickle.dump(o_history_related_test, fp)
del o_label_test
del o_history_related_test
gc.collect()

from cenet.

It seems that it is not possible to handle the dataset "GDELT"!!!!!!!! about cenet HOT 4 CLOSED

Comments (4)

Related Issues (16)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent