Code Monkey home page Code Monkey logo

Comments (4)

xyjigsaw avatar xyjigsaw commented on September 26, 2024

Hello!
Sorry for any inconvenience this has caused you. We generate the historical information with the data preprocessing function from RE-NET (Recurrent Event Network: Autoregressive Structure Inference over Temporal Knowledge Graphs). The code has not been optimized yet, so generating the data does require a large amount of memory. If you have optimized the code, we would welcome you sharing it with us by pushing it to the GitHub repository.
Best wishes

from cenet.

HuangRiKui avatar HuangRiKui commented on September 26, 2024

The following code in get_history_graph.py can be refined to save a great deal of memory, although the script still requires about 64GB of memory afterwards.

for ix in tqdm.tqdm(range(quadruples.shape[0])):
    s_history_oid.append([])
    for con_events in s_history_event_o[ix]:
        cur_events = con_events[:, 1].tolist()
        s_history_oid[-1] += cur_events
s_history_label_true = np.zeros((quadruples.shape[0], 1))

for ix in tqdm.tqdm(range(quadruples.shape[0])):
    if oo[ix] in s_history_oid[ix]:
        s_history_label_true[ix] = 1
return s_history_label_true

to

s_history_label_true = np.zeros((quadruples.shape[0], 1))
for ix in tqdm.tqdm(range(quadruples.shape[0])):
    hist_tmp = []
    for con_events in s_history_event_o[ix]:
        cur_events = con_events[:, 1].tolist()
        hist_tmp += cur_events
    hist_tmp = set(hist_tmp)
    if oo[ix] in hist_tmp:
        s_history_label_true[ix] = 1
return s_history_label_true

There are many similar parts of the code that can be optimized following this logic.

from cenet.

binchen4110 avatar binchen4110 commented on September 26, 2024

from cenet.

roadwide avatar roadwide commented on September 26, 2024

@HuangRiKui
Thank you. I hope this is helpful to others; here is my code.

import numpy as np
import os
from collections import defaultdict
import pickle
import dgl
import torch
import tqdm
import gc
from scipy.sparse import csc_matrix

# Banner written to stdout when the script starts.
# NOTE(review): presumably names the dataset being preprocessed — confirm.
print('GDELT')


def load_quadruples(inPath, fileName, fileName2=None):
    """Read (head, rel, tail, time) quadruples from one or two text files.

    Every line is split on whitespace: column 0 is the head, column 1 the
    relation, column 2 the tail and column 3 the timestamp, all parsed with
    ``int``.  Rows of ``fileName`` come first, followed by the rows of
    ``fileName2`` when it is given.

    NOTE: this module defines ``load_quadruples`` a second time further
    down; that later definition is the one bound to the name once the
    module has been executed.

    Args:
        inPath: directory that is joined in front of each file name.
        fileName: name of the first file to read.
        fileName2: optional name of a second file appended to the first.

    Returns:
        A pair of NumPy arrays: the quadruples in file order, and the
        distinct timestamps in ascending order.
    """
    sources = [fileName]
    if fileName2 is not None:
        sources.append(fileName2)

    rows = []
    seen_times = set()
    for source in sources:
        with open(os.path.join(inPath, source), 'r') as handle:
            for raw in handle:
                cols = raw.split()
                # Columns are converted in the order 0, 2, 1, 3.
                head, tail, rel, stamp = (int(cols[k]) for k in (0, 2, 1, 3))
                rows.append([head, rel, tail, stamp])
                seen_times.add(stamp)

    return np.asarray(rows), np.asarray(sorted(seen_times))


def get_total_number(inPath, fileName):
    """Return the first two integers found on the first line of a file.

    The first line is split on whitespace and its first two fields are
    converted with ``int``; any further fields and lines are ignored.
    The script below unpacks the result as ``(num_e, num_r)``.

    Args:
        inPath: directory that is joined in front of ``fileName``.
        fileName: name of the file to read.

    Returns:
        A 2-tuple of ints, or ``None`` when the file contains no lines.
    """
    with open(os.path.join(inPath, fileName), 'r') as handle:
        first_line = next(handle, None)
    if first_line is None:
        return None
    fields = first_line.split()
    return int(fields[0]), int(fields[1])


def load_quadruples(inPath, fileName, fileName2=None):
    """Read (head, rel, tail, time) quadruples from one or two text files.

    Every non-blank line is split on whitespace: column 0 is the head,
    column 1 the relation, column 2 the tail and column 3 the timestamp,
    all parsed with ``int``.  Rows of ``fileName`` come first, followed by
    the rows of ``fileName2`` when it is given.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising ``IndexError``.

    Args:
        inPath: directory that is joined in front of each file name.
        fileName: name of the first file to read.
        fileName2: optional name of a second file appended to the first.

    Returns:
        ``(quadruples, times)`` as NumPy arrays.  ``quadruples`` has one
        ``[head, rel, tail, time]`` row per input line, in file order;
        ``times`` holds the distinct timestamps in ascending order.  When
        no rows were read the arrays have shapes ``(0, 4)`` and ``(0,)``
        so that column indexing such as ``quadruples[:, 0]`` still works.

    Raises:
        IndexError: a non-blank line has fewer than four fields.
        ValueError: a field is not a valid integer.
    """
    quadrupleList = []
    times = set()

    # One parsing loop serves both files; the second is simply optional.
    file_names = [fileName] if fileName2 is None else [fileName, fileName2]
    for name in file_names:
        with open(os.path.join(inPath, name), 'r') as fr:
            for line in fr:
                line_split = line.split()
                if not line_split:
                    continue
                head = int(line_split[0])
                rel = int(line_split[1])
                tail = int(line_split[2])
                timestamp = int(line_split[3])
                quadrupleList.append([head, rel, tail, timestamp])
                times.add(timestamp)

    if not quadrupleList:
        # np.array([]) would be 1-D; keep the documented 2-D layout.
        return np.empty((0, 4), dtype=int), np.empty((0,), dtype=int)

    return np.array(quadrupleList), np.asarray(sorted(times))


def get_data_with_t(data, tim):
    """Collect the first three fields of every quadruple stamped ``tim``.

    Args:
        data: iterable of rows whose index 3 holds the timestamp.
        tim: timestamp value to select.

    Returns:
        A NumPy array built from the ``[row[0], row[1], row[2]]`` lists of
        the matching rows, in their original order.
    """
    selected = []
    for quad in data:
        if quad[3] == tim:
            selected.append([quad[0], quad[1], quad[2]])
    return np.array(selected)


def comp_deg_norm(g):
    """Return the reciprocal of every node's in-degree as a float tensor.

    Nodes whose in-degree is zero are treated as having in-degree one, so
    their value is 1.0 rather than a division by zero.

    Args:
        g: graph object exposing ``number_of_nodes()`` and
            ``in_degrees(nodes)``; ``get_big_graph`` in this file passes a
            ``dgl.DGLGraph``.

    Returns:
        A tensor with one entry per node.
    """
    node_ids = range(g.number_of_nodes())
    degrees = g.in_degrees(node_ids).float()
    # Avoid dividing by zero for nodes with no incoming edges.
    degrees[degrees == 0] = 1
    return 1.0 / degrees


def get_big_graph(data, num_rels):
    """Build a DGL graph over the entities that occur in ``data``.

    Entity ids found in the first and third columns are re-indexed to
    consecutive node numbers.  Every triple contributes two edges: one in
    its given direction and one reversed.

    Args:
        data: array whose transpose unpacks into (src, rel, dst) rows.
        num_rels: offset added to a relation id to obtain the id stored for
            the opposite direction of the same edge.

    Returns:
        The graph, with node data ``'id'`` (original entity id) and
        ``'norm'`` (from ``comp_deg_norm``), edge data ``'type_s'`` and
        ``'type_o'``, and an ``ids`` attribute mapping each original entity
        id to its node number.
    """
    heads, rels, tails = data.transpose()
    node_ids, relabeled = np.unique((heads, tails), return_inverse=True)
    heads, tails = np.reshape(relabeled, (2, -1))

    graph = dgl.DGLGraph()
    graph.add_nodes(len(node_ids))

    # First half of the edge list is the given direction, second half the reverse.
    edge_src = np.concatenate((heads, tails))
    edge_dst = np.concatenate((tails, heads))
    shifted = rels + num_rels
    types_for_o = np.concatenate((shifted, rels))
    types_for_s = np.concatenate((rels, shifted))
    graph.add_edges(edge_src, edge_dst)

    norm = comp_deg_norm(graph)
    graph.ndata.update({'id': torch.from_numpy(node_ids).long().view(-1, 1), 'norm': norm.view(-1, 1)})
    graph.edata['type_s'] = torch.LongTensor(types_for_s)
    graph.edata['type_o'] = torch.LongTensor(types_for_o)
    graph.ids = {entity: position for position, entity in enumerate(node_ids)}
    return graph

def _frequency_matrix(row_parts, col_parts, count_parts, shape):
    """Assemble per-row (entity, count) pieces into a float64 CSC matrix."""
    if not row_parts:
        return csc_matrix(shape, dtype=np.float64)
    rows = np.concatenate(row_parts)
    cols = np.concatenate(col_parts)
    data = np.concatenate(count_parts).astype(np.float64)
    return csc_matrix((data, (rows, cols)), shape=shape)


def get_history_target(quadruples, s_history_event_o, o_history_event_s, actor, target=None,
                       num_entities=None):
    """Derive history-based targets for every quadruple.

    For row ``ix`` the relevant history is ``s_history_event_o[ix]`` when
    ``actor == 's'`` and ``o_history_event_s[ix]`` otherwise.  A history is
    a sequence of 2-D arrays (one per past timestamp) whose column 0 is a
    relation id and whose column 1 is an entity id.

    Two targets can be produced:

    * label: an ``(N, 1)`` float array holding 1 where the row's answer
      entity (column 2 for actor ``'s'``, column 0 otherwise) occurs
      anywhere in column 1 of its history, else 0.
    * frequency: an ``(N, num_entities)`` float64 ``csc_matrix`` where
      entry ``[ix, e]`` counts the history arrays in which entity ``e``
      appears together with the row's own relation.  An entity repeated
      inside one array still counts once for that array.

    The frequency matrix is assembled directly in sparse form; a dense
    ``(N, num_entities)`` buffer is never allocated.

    Args:
        quadruples: array with columns (subject, relation, object, time).
        s_history_event_o: per-row histories used for actor ``'s'``.
        o_history_event_s: per-row histories used for any other actor.
        actor: ``'s'`` selects the subject side; anything else the object side.
        target: ``'label'`` returns only the label array, any other
            non-None value returns only the frequency matrix, and ``None``
            returns both.
        num_entities: number of columns of the frequency matrix.  When left
            as ``None`` the module-level ``num_e`` is used.

    Returns:
        The label array, the frequency matrix, or the tuple
        ``(label, frequency)``, depending on ``target``.
    """
    if actor == 's':
        histories, answers = s_history_event_o, quadruples[:, 2]
    else:
        histories, answers = o_history_event_s, quadruples[:, 0]
    rels = quadruples[:, 1]
    n_rows = quadruples.shape[0]

    want_label = target is None or target == 'label'
    want_related = target != 'label'
    if want_related and num_entities is None:
        num_entities = num_e  # module-level entity count set by the script

    labels = np.zeros((n_rows, 1)) if want_label else None
    row_parts, col_parts, count_parts = [], [], []

    for ix in tqdm.tqdm(range(n_rows)):
        seen = set()
        matched = []
        for con_events in histories[ix]:
            if want_label:
                seen.update(con_events[:, 1].tolist())
            if want_related:
                same_rel = con_events[:, 0] == rels[ix]
                # np.unique: one array contributes at most 1 per entity.
                matched.append(np.unique(con_events[same_rel, 1]))
        if want_label and answers[ix] in seen:
            labels[ix] = 1
        if matched:
            entities, counts = np.unique(np.concatenate(matched), return_counts=True)
            row_parts.append(np.full(entities.shape[0], ix, dtype=np.int64))
            col_parts.append(entities)
            count_parts.append(counts)

    if not want_related:
        return labels

    related = _frequency_matrix(row_parts, col_parts, count_parts, (n_rows, num_entities))
    if target is not None:
        return related
    return labels, related

# Load the three splits; inPath is '' so the files are looked up relative
# to the current working directory.
train_data, train_times = load_quadruples('', 'train.txt')
test_data, test_times = load_quadruples('', 'test.txt')
dev_data, dev_times = load_quadruples('', 'valid.txt')
# total_data, _ = load_quadruples('', 'train.txt', 'test.txt')

# First two integers on the first line of stat.txt.
num_e, num_r = get_total_number('', 'stat.txt')

# Committed per-entity history: s_his[e] / o_his[e] collect one array per
# flushed timestamp, and s_his_t[e] / o_his_t[e] the matching timestamps.
s_his = [[] for _ in range(num_e)]
o_his = [[] for _ in range(num_e)]
s_his_t = [[] for _ in range(num_e)]
o_his_t = [[] for _ in range(num_e)]
# One history snapshot per training quadruple, filled in by the loop below.
s_history_data = [[] for _ in range(len(train_data))]
o_history_data = [[] for _ in range(len(train_data))]
s_history_data_t = [[] for _ in range(len(train_data))]
o_history_data_t = [[] for _ in range(len(train_data))]
# NOTE(review): `e` is not read anywhere in the visible script and `r` is
# rebound inside the loops — these two initialisations look unused.
e = []
r = []
# Timestamp of the most recently processed quadruple; shared by all three loops.
latest_t = 0
# Events of the timestamp currently being processed, not yet committed to
# s_his / o_his, plus the timestamp they belong to.
s_his_cache = [[] for _ in range(num_e)]
o_his_cache = [[] for _ in range(num_e)]
s_his_cache_t = [None for _ in range(num_e)]
o_his_cache_t = [None for _ in range(num_e)]


# Walk the training quadruples in file order and record, for each one, the
# history its subject and object had accumulated before it.
# NOTE(review): grouping by timestamp relies on rows with equal timestamps
# being contiguous — confirm the input files are time-ordered.
for i, train in enumerate(train_data):
    if i % 10000 == 0:
        print("train", i, len(train_data))
    # if i == 10000:
    #     break
    t = train[3]
    if latest_t != t:

        # The timestamp changed: commit every entity's pending events as one
        # new history entry and empty the pending caches.
        for ee in range(num_e):
            if len(s_his_cache[ee]) != 0:

                s_his[ee].append(s_his_cache[ee].copy())
                s_his_t[ee].append(s_his_cache_t[ee])
                s_his_cache[ee] = []
                s_his_cache_t[ee] = None
            if len(o_his_cache[ee]) != 0:

                o_his[ee].append(o_his_cache[ee].copy())
                o_his_t[ee].append(o_his_cache_t[ee])
                o_his_cache[ee] = []
                o_his_cache_t[ee] = None
        latest_t = t
    s = train[0]
    r = train[1]
    o = train[2]
    # print(s_his[r][s])
    # Snapshot the committed history. list.copy() is shallow, so the snapshot
    # owns its list but shares the per-timestamp arrays with s_his / o_his.
    s_history_data[i] = s_his[s].copy()
    o_history_data[i] = o_his[o].copy()
    s_history_data_t[i] = s_his_t[s].copy()
    o_history_data_t[i] = o_his_t[o].copy()
    # print(o_history_data_g[i])

    # Queue this event as a [relation, object] row for its subject; it only
    # becomes part of s_his once a later row carries a different timestamp.
    if len(s_his_cache[s]) == 0:
        s_his_cache[s] = np.array([[r, o]])
    else:
        s_his_cache[s] = np.concatenate((s_his_cache[s], [[r, o]]), axis=0)
    s_his_cache_t[s] = t

    # Same for the object side, stored as a [relation, subject] row.
    if len(o_his_cache[o]) == 0:
        o_his_cache[o] = np.array([[r, s]])
    else:
        o_his_cache[o] = np.concatenate((o_his_cache[o], [[r, s]]), axis=0)
    o_his_cache_t[o] = t

    # print(s_history_data[i], s_history_data_g[i])
    # with open('ttt.txt', 'wb') as fp:
    #     pickle.dump(s_history_data_g, fp)
    # print("save")

# Persist the per-quadruple histories together with their timestamps.
# The files are pickles despite the .txt extension.
with open('train_history_sub.txt', 'wb') as fp:
    pickle.dump([s_history_data, s_history_data_t], fp)
with open('train_history_ob.txt', 'wb') as fp:
    pickle.dump([o_history_data, o_history_data_t], fp)

# Training targets are computed one at a time; each result is written out,
# deleted and followed by gc.collect() before the next one is computed.
s_label_train = get_history_target(train_data, s_history_data, o_history_data, 's', 'label')
with open('train_s_label.txt', 'wb') as fp:
    pickle.dump(s_label_train, fp)
del s_label_train
gc.collect()

# The training frequency matrices are written with torch.save, whereas the
# labels above use pickle.dump.
s_history_related_train = get_history_target(train_data, s_history_data, o_history_data, 's', 'related')
torch.save(s_history_related_train, 'train_s_frequency.txt')
del s_history_related_train
gc.collect()

o_label_train = get_history_target(train_data, s_history_data, o_history_data, 'o', 'label')
with open('train_o_label.txt', 'wb') as fp:
    pickle.dump(o_label_train, fp)
del o_label_train
gc.collect()

o_history_related_train = get_history_target(train_data, s_history_data, o_history_data, 'o', 'related')
torch.save(o_history_related_train, 'train_o_frequency.txt')
del o_history_related_train
# The training split and its snapshots are dropped before the validation
# split is processed.
del train_data
del s_history_data
del o_history_data
gc.collect()


# print(s_history_data[0])
# One history snapshot per validation quadruple.
s_history_data_dev = [[] for _ in range(len(dev_data))]
o_history_data_dev = [[] for _ in range(len(dev_data))]
s_history_data_dev_t = [[] for _ in range(len(dev_data))]
o_history_data_dev_t = [[] for _ in range(len(dev_data))]

# Continues from the state left by the training loop: s_his / o_his, the
# pending caches and latest_t are all carried over.
for i, dev in enumerate(dev_data):
    if i % 10000 == 0:
        print("valid", i, len(dev_data))
    t = dev[3]
    if latest_t != t:
        # New timestamp: commit every entity's pending events to its history.
        for ee in range(num_e):
            if len(s_his_cache[ee]) != 0:
                s_his_t[ee].append(s_his_cache_t[ee])
                s_his[ee].append(s_his_cache[ee].copy())
                s_his_cache[ee] = []
                s_his_cache_t[ee] = None
            if len(o_his_cache[ee]) != 0:

                o_his_t[ee].append(o_his_cache_t[ee])
                o_his[ee].append(o_his_cache[ee].copy())

                o_his_cache[ee] = []
                o_his_cache_t[ee] = None
        latest_t = t
    s = dev[0]
    r = dev[1]
    o = dev[2]
    # Shallow snapshots of the history committed so far.
    s_history_data_dev[i] = s_his[s].copy()
    o_history_data_dev[i] = o_his[o].copy()
    s_history_data_dev_t[i] = s_his_t[s].copy()
    o_history_data_dev_t[i] = o_his_t[o].copy()
    # Validation events are queued as well, so they become history for
    # later validation rows and for the test split.
    if len(s_his_cache[s]) == 0:
        s_his_cache[s] = np.array([[r, o]])
    else:
        s_his_cache[s] = np.concatenate((s_his_cache[s], [[r, o]]), axis=0)
    s_his_cache_t[s] = t

    if len(o_his_cache[o]) == 0:
        o_his_cache[o] = np.array([[r, s]])
    else:
        o_his_cache[o] = np.concatenate((o_his_cache[o], [[r, s]]), axis=0)
    o_his_cache_t[o] = t

    # print(o_his_cache[o])

# Pickled despite the .txt extension.
with open('dev_history_sub.txt', 'wb') as fp:
    pickle.dump([s_history_data_dev, s_history_data_dev_t], fp)
with open('dev_history_ob.txt', 'wb') as fp:
    pickle.dump([o_history_data_dev, o_history_data_dev_t], fp)

# With no `target` argument get_history_target returns the label array and
# the frequency matrix together; both are written with pickle.dump.
s_label_dev, s_history_related_dev = get_history_target(dev_data, s_history_data_dev, o_history_data_dev, 's')
with open('dev_s_label.txt', 'wb') as fp:
    pickle.dump(s_label_dev, fp)
with open('dev_s_frequency.txt', 'wb') as fp:
    pickle.dump(s_history_related_dev, fp)
# Release the subject-side results before computing the object side.
del s_label_dev
del s_history_related_dev
gc.collect()


o_label_dev, o_history_related_dev = get_history_target(dev_data, s_history_data_dev, o_history_data_dev, 'o')
with open('dev_o_label.txt', 'wb') as fp:
    pickle.dump(o_label_dev, fp)
with open('dev_o_frequency.txt', 'wb') as fp:
    pickle.dump(o_history_related_dev, fp)
del o_label_dev
del o_history_related_dev
gc.collect()

# One history snapshot per test quadruple.
s_history_data_test = [[] for _ in range(len(test_data))]
o_history_data_test = [[] for _ in range(len(test_data))]

s_history_data_test_t = [[] for _ in range(len(test_data))]
o_history_data_test_t = [[] for _ in range(len(test_data))]

# Continues from the state left by the validation loop.
for i, test in enumerate(test_data):
    if i % 10000 == 0:
        print("test", i, len(test_data))
    t = test[3]
    if latest_t != t:
        # Commit whatever is still pending. Because the test loop never adds
        # to the caches (see below), only events queued by the earlier loops
        # can be flushed here.
        for ee in range(num_e):
            if len(s_his_cache[ee]) != 0:
                s_his_t[ee].append(s_his_cache_t[ee])

                s_his[ee].append(s_his_cache[ee].copy())
                s_his_cache[ee] = []
                s_his_cache_t[ee] = None
            if len(o_his_cache[ee]) != 0:

                o_his_t[ee].append(o_his_cache_t[ee])

                o_his[ee].append(o_his_cache[ee].copy())
                o_his_cache[ee] = []
                o_his_cache_t[ee] = None
        latest_t = t
    s = test[0]
    r = test[1]
    o = test[2]
    # Shallow snapshots of the history committed so far.
    s_history_data_test[i] = s_his[s].copy()
    o_history_data_test[i] = o_his[o].copy()
    s_history_data_test_t[i] = s_his_t[s].copy()
    o_history_data_test_t[i] = o_his_t[o].copy()
    # Both branches are no-ops: the cache updates are commented out, so test
    # events are never added to any history. Only the cache timestamp is set.
    if len(s_his_cache[s]) == 0:
        # s_his_cache[s] = np.array([[r, o]])
        pass
    else:
        pass
        # s_his_cache[s] = np.concatenate((s_his_cache[s], [[r, o]]), axis=0)
    s_his_cache_t[s] = t

    if len(o_his_cache[o]) == 0:
        pass
        # o_his_cache[o] = np.array([[r, s]])
    else:
        pass
        # o_his_cache[o] = np.concatenate((o_his_cache[o], [[r, s]]), axis=0)
    o_his_cache_t[o] = t
    # print(o_his_cache[o])


# Pickled despite the .txt extension.
with open('test_history_sub.txt', 'wb') as fp:
    pickle.dump([s_history_data_test, s_history_data_test_t], fp)
with open('test_history_ob.txt', 'wb') as fp:
    pickle.dump([o_history_data_test, o_history_data_test_t], fp)

# With no `target` argument get_history_target returns the label array and
# the frequency matrix together; both are written with pickle.dump.
s_label_test, s_history_related_test = get_history_target(test_data, s_history_data_test, o_history_data_test, 's')
with open('test_s_label.txt', 'wb') as fp:
    pickle.dump(s_label_test, fp)
with open('test_s_frequency.txt', 'wb') as fp:
    pickle.dump(s_history_related_test, fp)
# Release the subject-side results before computing the object side.
del s_label_test
del s_history_related_test
gc.collect()


o_label_test, o_history_related_test = get_history_target(test_data, s_history_data_test, o_history_data_test, 'o')
with open('test_o_label.txt', 'wb') as fp:
    pickle.dump(o_label_test, fp)
with open('test_o_frequency.txt', 'wb') as fp:
    pickle.dump(o_history_related_test, fp)
del o_label_test
del o_history_related_test
gc.collect()

from cenet.

Related Issues (16)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, to make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.