Hi <a class="user-mention notranslate" data-hovercard-type="user" data-hovercard-url="

Question about the Fig.12 about equidock_public HOT 4 OPEN

octavian-ganea commented on May 24, 2024

Question about the Fig.12

from equidock_public.

Comments (4)

octavian-ganea commented on May 24, 2024

Hi,

Yes, you are right, the DB5.5 data contains many inconsistencies between the bound and unbound structures for the same protein, not sure exactly why. To generate that plot, we just did a dynamic programming implementation of the longest common subsequence problem to align the two sequences, and then only use those residues in the RMSD computations. This is not a perfect solution, but not sure what can be done to alleviate this data issue ...

from equidock_public.

bbjy commented on May 24, 2024

Thank you for your reply!
I also try to do that as you said with the Biopython-> pairwise2.align.globalxx() function to align the bound and unbound sequences. And I get the result with complex_rmsd_median: 1.455, complex_rmsd_mean: 2.1065, complex_rmsd_std: 2.5771. I seems similar to your published result, however, the max complex rmsd I calculated is 29.3533, which is greater than 35 in Figure 12. So I am not sure that whether there is any mistake in my code. It will be so great if you could share the code of how to get the rmsd between bound and unbound structures. Thanks so much!

from equidock_public.

octavian-ganea commented on May 24, 2024

Yep, here is the code (also see the full list of RMSD computed below).


import os

import numpy as np
from src.utils.protein_utils import rigid_transform_Kabsch_3D
from src.utils.db5_data import get_residues_db5
import scipy.spatial as spa




# Dynamic programming implementation of LCS problem
# Returns length of LCS for X[0..m-1], Y[0..n-1]
def LCSubSeq(X, Y):
    m = len(X)
    n = len(Y)
    L = [[0 for _ in range(n + 1)] for _ in range(m + 1)]

    # Following steps build L[m+1][n+1] in bottom up fashion. Note
    # that L[i][j] contains length of LCS of X[0..i-1] and Y[0..j-1]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i - 1][1]['resname'].iloc[0] == Y[j - 1][1]['resname'].iloc[0]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])

    # Following code is used to print LCS
    index = L[m][n]

    # Create a character array to store the lcs string
    # lcs = [""] * (index + 1)
    # lcs[index] = ""
    lcs_X = [""] * (index)
    lcs_Y = [""] * (index)

    # Start from the right-most-bottom-most corner and one by one store characters in lcs[]
    i = m
    j = n
    while i > 0 and j > 0:
        # If current character in X[] and Y are same, then current character is part of LCS
        if X[i - 1][1]['resname'].iloc[0] == Y[j - 1][1]['resname'].iloc[0]:
            lcs_X[index - 1] = X[i - 1]
            lcs_Y[index - 1] = Y[j - 1]
            i -= 1
            j -= 1
            index -= 1

        # If not same, then find the largest of the two and go in the direction of larger value
        elif L[i - 1][j] > L[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return lcs_X, lcs_Y



def filter_residues(residues):
    residues_filtered = []
    for residue in residues:
        df = residue[1]
        Natom = df[df['atom_name'] == 'N']
        alphaCatom = df[df['atom_name'] == 'CA']
        Catom = df[df['atom_name'] == 'C']

        if Natom.shape[0] == 1 and alphaCatom.shape[0] == 1 and Catom.shape[0] == 1:
            residues_filtered.append(residue)
    return residues_filtered


###################
def get_alphaC_loc_array(bound_predic_clean_list):
    bound_alphaC_loc_clean_list = []
    for residue in bound_predic_clean_list:
        df = residue[1]
        alphaCatom = df[df['atom_name'] == 'CA']
        alphaC_loc = alphaCatom[['x', 'y', 'z']].to_numpy().squeeze().astype(np.float32)
        assert alphaC_loc.shape == (3,), \
            f"alphac loc shape problem, shape: {alphaC_loc.shape} residue {df} resid {df['residue']}"
        bound_alphaC_loc_clean_list.append(alphaC_loc)
    if len(bound_alphaC_loc_clean_list) <= 1:
        bound_alphaC_loc_clean_list.append(np.zeros(3))
    return np.stack(bound_alphaC_loc_clean_list, axis=0)  # (N_res,3)


def get_alphaC_list(bound_predic_clean_list):
    lll = []
    for residue in bound_predic_clean_list:
        df = residue[1]
        alphaCatom = df[df['atom_name'] == 'CA']['atom_number'].iloc[0]
        lll.append(alphaCatom)
    return lll


def get_CA_coords(b_file, u_file):
    b_residues = filter_residues(get_residues_db5(b_file))
    u_residues = filter_residues(get_residues_db5(u_file))
    before = len(b_residues)
    b_residues, u_residues = LCSubSeq(b_residues, u_residues)
    # print('after b_res', get_alphaC_list(b_residues))
    # print('after u_res', get_alphaC_list(u_residues))
    # print(functools.reduce(lambda x, y : x and y, map(lambda p, q: p == q, get_alphaC_list(b_residues),get_alphaC_list(u_residues)), True))

    b_residues = get_alphaC_loc_array(b_residues)
    u_residues = get_alphaC_loc_array(u_residues)
    after = b_residues.shape[0]
    if before == after == u_residues.shape[0]:
        print(b_file, u_file)
    print(before, after, u_residues.shape[0])
    return b_residues, u_residues


def plot_all():
    input_dir = './data/benchmark5.5/structures'
    pdb_files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f)) and f.endswith('.pdb')]

    all_rmsd = []

    num_test_files = 0
    for file in pdb_files:
        if file.endswith('_l_u.pdb'):
            ll = len('_l_u.pdb')
            bound_file = os.path.join(input_dir, file[:-ll] + '_l_b.pdb')
            unbound_file = os.path.join(input_dir, file[:-ll] + '_l_u.pdb')
        elif file.endswith('_r_u.pdb'):
            ll = len('_r_u.pdb')
            bound_file = os.path.join(input_dir, file[:-ll] + '_r_b.pdb')
            unbound_file = os.path.join(input_dir, file[:-ll] + '_r_u.pdb')
        else:
            continue

        b_coords, u_coords = get_CA_coords(bound_file, unbound_file)

        dmat = spa.distance.cdist(b_coords, b_coords)
        dmat = dmat + 100. * np.eye(dmat.shape[0])
        # print('mean d = ', np.min(dmat, axis=1).mean())
        # print('median d = ', np.median(np.min(dmat, axis=1)))

        if b_coords.shape[0] != u_coords.shape[0]:
            print(file)
            continue
        num_test_files += 1
        R, b = rigid_transform_Kabsch_3D(b_coords.T, u_coords.T)
        b_coords_aligned = ((R @ b_coords.T) + b).T
        rmsd = np.sqrt(np.mean(np.sum((b_coords_aligned - u_coords) ** 2, axis=1)))

        all_rmsd.append(rmsd)
        # print('num_test_files done ', num_test_files)


    print('all_rmsd', all_rmsd)
    print(len(all_rmsd))
    return all_rmsd


all_rmsd = plot_all()

# all_rmsd = [0.6962418, 27.03626981434666, 2.9686131, 3.5345614, 1.7044786, 1.319995, 5.1798444, 0.5049575, 0.45675388, 2.6630943, 2.546523, 1.2128848, 0.59461874, 1.1774287, 0.8224659, 1.5111822, 1.1599942, 1.7826275, 0.65784246, 0.5582014, 2.8252356, 1.9506255, 1.3800596, 2.7822397, 2.0161128, 1.4857557, 0.80386686, 1.2378623, 2.7285552, 19.54151, 1.5791698, 0.77282906, 0.32637906, 0.4722987, 1.2494422, 0.7638261, 1.3630565, 0.5317533, 1.9764836e-06, 0.55156565, 2.7910872, 0.9011328, 1.1561608, 2.5926828, 3.8203578, 0.70295584, 1.1666499, 0.6264738, 0.6232077, 1.1883411, 0.56530625, 1.2028227, 1.3477685, 1.8282441, 1.5747728, 2.128397, 1.7903032, 4.280821, 1.4810741, 2.0309215, 1.7389334, 1.3359979, 0.9335407, 1.2722429, 1.0772936, 1.4273612, 0.90363824, 1.1851745, 0.65965843, 0.6105662, 0.88963103, 0.7989469, 3.0286636, 1.4925646, 1.0704238, 1.2667646, 0.9554937, 2.1890259, 0.4463879, 3.0691504, 12.041163, 1.1361443, 1.6985788, 1.8809482, 0.70238364, 2.1219745, 2.4284933, 1.1817397, 0.618453, 3.5171118, 1.7590178, 0.57367647, 0.8153385, 0.5950234, 1.4212565, 0.8913942, 4.423849, 0.6001566, 3.0566185, 0.45846713, 0.3916329, 3.1001763, 0.84036165, 1.2455623, 0.94656336, 0.43186194, 0.70226103, 2.1483116, 0.65598303, 0.49707252, 9.990827, 1.2918882, 0.77361584, 1.4257747, 0.80301446, 1.5146624, 1.3873624, 0.9554937, 1.0153382, 1.3809197, 0.9295433, 0.4461008, 0.3887217, 2.71268, 0.6058705, 2.5367131, 1.5577396, 2.7555687, 1.5684363, 0.5662212, 1.8282441, 0.34369707, 3.5691407, 29.304473532867767, 0.8604853, 3.8777807, 0.3287732, 1.192507, 0.53047264, 1.1643264, 1.8756974, 4.397976, 1.9317511, 0.8851453, 4.280821, 1.6856755, 2.1955128, 1.0414593, 3.8239803, 0.6302643, 1.2624161, 2.8827038, 1.0478029, 3.4473317, 0.770314, 0.64568347, 1.4387469, 1.6138345, 1.7465917, 1.962105, 1.0688684, 1.5654682, 0.43968844, 0.7649709, 1.7110707, 1.9716086, 1.4674189, 1.9981797, 0.8618321, 0.6721722, 2.071398, 0.51870245, 2.982918, 0.8164904, 3.4028974, 1.0031127, 0.6929141, 1.1985035e-06, 22.282746453028356, 1.1930548, 0.49166816, 4.2538457, 0.889148, 1.6589879, 0.7498187, 0.4058273, 0.5056364, 0.62057555, 4.0950875, 1.1783762, 1.0915266, 9.825888, 0.6582492, 1.2743578, 0.4995857, 10.788652, 11.905804, 0.8787025, 2.2948368e-06, 3.3075848, 0.6850603, 1.7519094, 1.9719495, 0.44748992, 0.55696625, 0.6096156, 0.64054054, 3.0099761, 2.3606842, 1.6786654, 1.1942681, 0.6978604, 0.43543476, 7.2658124, 0.8521949, 1.4701211, 1.7214085, 0.5289863, 1.1149201, 3.729744, 1.7212167, 5.581814, 3.9925954, 0.9347552, 2.0152748, 5.539305, 1.3955237, 1.253993, 18.731201, 0.5405805, 1.1983395, 1.3311571, 1.0794693, 2.2760947, 38.5945, 2.7223735, 1.6750678, 0.51923317, 1.7674416, 2.7703874, 3.936683e-06, 1.4250864, 1.7576742, 0.66976196, 2.1966534e-06, 3.8855648, 0.572205, 1.06187, 0.4635174, 0.5988311, 0.46330267, 1.9889144, 1.5581557, 3.1054556, 1.0541656, 1.6353214e-06, 1.032354, 0.665778, 1.087051, 0.6215222, 0.60802054, 6.606358, 0.5751045, 0.4689752, 0.37107491, 1.067861, 4.729647, 4.178835, 1.2696042, 0.74371666, 4.3974447, 3.731839, 1.1009775, 6.2470875, 2.1379695, 2.7581146, 3.2439315, 0.7497935, 4.4688506, 0.74708706, 1.2113068, 3.0039606, 1.6956894, 0.7109091, 1.6787137, 32.79343038081955, 7.0521705e-06, 1.1089412, 0.86929286, 2.2734303, 1.9720659, 1.9889144, 0.5684412, 0.72357994, 1.0572548, 1.2094994, 0.87182486, 0.6639882, 0.6363185, 0.68100965, 1.0455867, 2.7395332, 7.231181, 0.629642, 7.0089383, 2.757306, 1.0041324, 1.650062, 2.9461417, 2.0531807, 0.3912766, 0.6195286, 2.1035693, 0.93100214, 1.7368435, 2.107763, 1.3961453, 1.3059765, 1.1841563, 7.2915254, 0.8731526, 2.5055175, 1.0232973, 2.1887197, 1.6135501, 1.7934625, 2.218598, 0.6318677, 1.1169442, 1.41212, 1.8013055, 0.83939904, 0.5386748, 1.3009996, 1.0179409, 1.5744836, 1.7691509, 1.8033211, 4.0990553, 0.40228027, 1.5470148, 2.229582, 1.0745113, 1.2127255, 0.64667904, 3.870754, 0.6516309, 0.5308961, 0.6830703, 0.37585256, 3.6248713, 1.0849175, 1.1546352, 0.35467917, 0.8584139, 1.713536, 0.9835134, 4.863827, 1.1013104, 1.5991068, 2.7803283, 0.5220013, 0.7984924, 1.1942775, 5.522912, 1.0631386, 0.45954815, 0.8350289, 1.6775175, 2.48245, 1.1269842, 0.84553206, 0.59515387, 0.7430531, 1.1843232, 0.8322676, 0.71300054, 0.8555032, 1.1336085, 1.536494, 29.936849938155948, 0.5567714, 0.50779784, 2.8568308, 0.5459214, 0.766995, 0.9020894, 1.6567634, 1.1025717, 0.7798935, 1.0375565, 0.63780326, 0.66015315, 0.5144089, 0.87448967, 2.9359785e-07, 0.51562625, 1.1334723, 1.5090711, 2.2924914, 0.9140785, 8.136824, 3.3150349, 0.3546582, 6.9085704e-06, 2.0621643, 1.1151338, 4.43456, 0.51987344, 1.9622375, 1.1580093, 0.27936888, 0.76604265, 1.1473372, 0.7945121, 1.3007371, 1.3704231, 1.1207563, 2.4275997, 1.1122599, 17.701017, 0.48094028, 3.4398391, 4.495497, 3.4427142, 1.3895931, 0.8203486, 1.0786057, 4.4397645, 3.2132168, 1.5061754e-06, 5.1517496, 4.1289624e-07, 1.9721442, 0.8347928, 9.394635, 0.35105443, 1.3121231, 1.9221034, 0.95891297, 1.2227244, 8.433344, 2.9149215, 1.9889144, 0.37265316, 2.0469155, 2.2152584, 0.4578064, 1.2578293, 2.6659403, 0.62059456, 4.739158, 8.855496, 0.48358005, 0.6706964, 0.9605975, 1.148719, 1.3905442, 3.1397026, 6.062035, 0.66542107, 1.0939509, 3.4895027, 0.7346339, 0.80699295, 3.7031268e-07, 1.1419114, 1.5155606, 0.72397625, 2.7118328, 1.6419793, 0.63384485, 0.8620122, 3.213217, 9.210633, 0.41391, 0.58243823, 0.24865517, 0.50800824, 1.123511, 1.8632666, 0.4743436, 2.338837, 2.268722, 10.291622, 0.41552648, 1.7778066, 1.0593673, 0.7203342, 1.0793355, 1.1355674, 0.2461644, 1.548148, 0.5702176, 0.5160546, 2.4865284, 1.7082264, 1.0295498, 1.2837969, 1.753832, 1.0659016, 0.7255667, 0.62942755, 2.8856826, 0.74441904, 0.46673965, 0.5662806, 1.1644644, 2.005169, 0.51830894, 1.7189244, 0.7285067, 2.3819146, 1.0490074, 0.6707141, 2.7278082, 1.0616714, 4.2380977, 0.8752755, 0.32089052, 1.5652136, 1.512635, 1.0160714, 1.4508268, 1.3173221, 0.32655928, 0.40831152, 0.5914244, 0.86104983, 2.3572385, 1.2258587, 3.525297, 10.330421, 4.2397623, 1.1214422, 0.32262784, 10.559156, 0.429923, 1.493592, 0.35212544, 0.9639773, 1.025263]


import plotly.express as px


fig = px.histogram(all_rmsd, nbins=100)
fig.add_vline(x=np.median(all_rmsd), annotation_text="median = " + "{:.1f}".format(np.median(all_rmsd)) + ' &#8491;',
              annotation_font_size=20, annotation_position="top right", line_dash = 'dash', line_color = 'firebrick', annotation_font_color= 'firebrick')
fig.add_vline(x=np.mean(all_rmsd), annotation_text="\t<br>mean = " + "{:.1f}".format(np.mean(all_rmsd)) + ' &#8491;',
              annotation_font_size=20, annotation_position="top right", line_dash = 'dash', line_color = 'green', annotation_font_color= 'green')

fig.update_xaxes(title='RMSD (&#8491;) bound - unbound structures (of the Cα atomic coordinates)', title_font = {"size": 20}, tickfont = {"size": 20})
fig.update_yaxes(title='Frequency in DB5.5', title_font = {"size": 20}, tickfont = {"size": 20})

fig.show()

from equidock_public.

bbjy commented on May 24, 2024

It is so kind of you!
Since the LCS subsquence may not be unique, it may influence the result I got. I will check my code with reference to this. Thank you so much again!

from equidock_public.

Question about the Fig.12 about equidock_public HOT 4 OPEN

Comments (4)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent