Skip to content
Snippets Groups Projects
Commit 69cc8f2f authored by Thomas Nägele's avatar Thomas Nägele :speech_balloon:
Browse files

changed data structure

parent 6bbd0de6
No related branches found
No related tags found
No related merge requests found
Pipeline #2607 passed
Showing
with 883 additions and 0 deletions
assays/Data/ExpressionBrowser_TE_trimmed.csv filter=lfs diff=lfs merge=lfs -text
assays/Data/Proteomics_data.csv filter=lfs diff=lfs merge=lfs -text
assays/Data/Proteomics_data.parquet filter=lfs diff=lfs merge=lfs -text
assays/Data/Proteomics_data_2.csv filter=lfs diff=lfs merge=lfs -text
assays/Data/transcript_data.parquet filter=lfs diff=lfs merge=lfs -text
assays/Data/transcripts_GE.csv filter=lfs diff=lfs merge=lfs -text
assays/Data/transcripts_GE.xlsx filter=lfs diff=lfs merge=lfs -text
assays/Data/transcripts_GE_2.csv filter=lfs diff=lfs merge=lfs -text
assays/Test/Test_API.ipynb filter=lfs diff=lfs merge=lfs -text
Data/transcript_data.csv
Data/Omics_db.db
\ No newline at end of file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as Patch
from matplotlib.lines import Line2D
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, SparsePCA
class AutoPCA():
    """
    Fits a two-component PCA (or sparse PCA) and provides functions for plotting the result.

    Inputs:
    - ``data``: ``DataFrame`` of the supplied data. All columns except the label column(s) should be numeric
    - ``target``: ``str`` (or list of ``str``) naming the label column(s). The first label column selects
      the marker colour; an optional second label column selects the marker shape
    - ``scale``: ``bool`` of default ``True`` to standardise the variables with ``StandardScaler`` before fitting
    - ``sparse``: ``bool`` of default ``False`` to use sparse PCA
    - ``alpha``: ``float`` for the alpha value of the sparse PCA

    Attributes:
    - ``X`` / ``y``: variables (columns containing missing values removed) and label column(s)
    - ``scores``: sample coordinates on the two components
    - ``loadings``: variable loadings with shape (n_variables, 2)
    - ``explained_variance_ratio``: share of the total variance captured by the two-component model
    - ``pvars``: per-component explained variance in percent (only set for the non-sparse PCA)

    Methods:
    - ``return_loadings``: returns the loadings as a ``DataFrame``
    - ``biplot``: creates a biplot on the supplied axes. If no axes are supplied, a new one is created
    - ``scatter_plot``: same as ``biplot`` but without the loading arrows

    Notes:
    - When using a sparse PCA the explained variance of each component becomes meaningless, as the
      components are not orthogonal anymore and their explained variances would overlap. Thus, only the
      total explained variance is worth reporting, suggesting how good the overall model is with two
      components (trace(P @ T.T @ T @ P.T) is the variance of the model). Additionally the loadings are
      not the same as the rotations, as the loadings are not orthogonal; the rotations are
      R = P @ (P.T @ P)^-1
    """

    # Palettes are cycled, so any number of groups can be drawn.
    _COLOR_CYCLE = ('#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf')
    _MARKER_CYCLE = ('o', '^', 's', 'p', 'D', 'v', '<', '>')

    def __init__(self, data: pd.DataFrame, target: str, scale = True, sparse = False, alpha = 1):
        self.data = data
        self.target = target
        self.sparse = sparse
        self.scale = scale

        # ``drop`` and column selection already return new objects, the input frame is never modified.
        self.X = self.data.drop(self.target, axis=1).dropna(axis=1)
        self.y = self.data[self.target].copy()

        if self.scale:
            X_scaled = StandardScaler().fit_transform(self.X)
        else:
            X_scaled = np.asarray(self.X, dtype=float)

        if not self.sparse:
            pca = PCA(n_components=2).fit(X_scaled)
        else:
            pca = SparsePCA(n_components=2, random_state=112, alpha=alpha).fit(X_scaled)

        self.X_reduced = pca.transform(X_scaled)
        self.scores = self.X_reduced[:, :2]
        self.loadings = pca.components_[:2].T

        # Both estimators centre the data before projecting, so the reconstruction T @ P.T has to be
        # compared against the centred data (this only matters for scale=False, scaled data is
        # centred already). trace(A.T @ A) equals the sum of the squared entries of A, which avoids
        # building an (n_samples x n_samples) matrix just to read its diagonal.
        centred = X_scaled - X_scaled.mean(axis=0)
        rebuilt = self.X_reduced @ self.loadings.T
        self.explained_variance_ratio = np.sum(np.square(rebuilt)) / np.sum(np.square(centred))

        if not self.sparse:
            self.pvars = pca.explained_variance_ratio_[:2] * 100

    def return_loadings(self):
        """Return the loadings as a ``DataFrame`` with columns ``Comp1``, ``Comp2`` and ``variable``."""
        loadings = pd.DataFrame(self.loadings.copy(), columns = ['Comp1', 'Comp2'])
        loadings['variable'] = self.X.columns
        return loadings

    @staticmethod
    def _style_map(levels, palette):
        """Assign one palette entry to every level, starting over once the palette is exhausted."""
        return {level: palette[i % len(palette)] for i, level in enumerate(levels)}

    def _label_frame(self):
        """Return the label column(s) as a ``DataFrame``, also when a single column name was given."""
        if isinstance(self.y, pd.Series):
            return self.y.to_frame()
        return self.y

    def _draw_scores(self, axs):
        """Scatter the scores coloured by the first label column and shaped by the second (if any), and add the legends."""
        labels = self._label_frame()
        group_values = labels.iloc[:, 0].to_numpy()
        cols = labels.iloc[:, 0].drop_duplicates().to_list()
        colors = self._style_map(cols, self._COLOR_CYCLE)

        has_shapes = labels.shape[1] > 1
        if has_shapes:
            shape_values = labels.iloc[:, 1].to_numpy()
            shps = labels.iloc[:, 1].drop_duplicates().to_list()
        else:
            shape_values = None
            shps = []
        shapes = self._style_map(shps, self._MARKER_CYCLE)

        for name in cols:
            in_group = group_values == name
            if not has_shapes:
                points = self.scores[in_group]
                if len(points):
                    axs.scatter(points[:, 0], points[:, 1], label=None,
                                marker=self._MARKER_CYCLE[0], color=colors[name])
                continue
            for shape in shps:
                points = self.scores[in_group & (shape_values == shape)]
                # Not every colour/shape combination has to be present in the data.
                if len(points) == 0:
                    continue
                axs.scatter(points[:, 0], points[:, 1], label=None,
                            marker=shapes[shape], color=colors[name])

        color_handles = [Line2D([0], [0], marker = 's', markersize=8, linestyle='None', label = name, color=color)
                         for name, color in colors.items()]
        color_legend = axs.legend(handles = color_handles, loc = 'lower center', ncols = max(len(cols), 1),
                                  bbox_to_anchor=(0.5, -0.13), frameon=False)
        if has_shapes:
            # Explicit handles keep the shape legend independent of the order in which the groups were
            # drawn; they use the first palette colour.
            shape_handles = [Line2D([0], [0], marker = marker, markersize=8, linestyle='None', label = level,
                                    color=self._COLOR_CYCLE[0])
                             for level, marker in shapes.items()]
            axs.legend(handles = shape_handles, loc = 'lower center', ncols = max(len(shps), 1),
                       bbox_to_anchor=(0.5, -0.2), frameon=False)
            # A second call to ``legend`` replaces the first one, so the colour legend is re-added.
            axs.add_artist(color_legend)

    def _label_axes(self, axs):
        """Remove the ticks and label both axes; the explained variance is added for the non-sparse PCA."""
        axs.set_xticks([])
        axs.set_yticks([])
        if not self.sparse:
            axs.set_xlabel(f'PC1 ({self.pvars[0]:.2f}%)')
            axs.set_ylabel(f'PC2 ({self.pvars[1]:.2f}%)')
        else:
            axs.set_xlabel('PC1')
            axs.set_ylabel('PC2')

    def biplot(self, axs = None):
        """
        Draw the scores together with one arrow per variable (its loadings).

        - ``axs``: matplotlib axes to draw on. If ``None``, a new 8x6 figure is created
        Returns the axes. The axes is also made the current pyplot axes.
        """
        if axs is None:
            _, axs = plt.subplots(figsize=(8,6))
        plt.sca(axs)

        self._draw_scores(axs)

        # Stretch the loadings to the largest absolute score of each component so arrows and points
        # share one scale.
        arrows = self.loadings * np.abs(self.scores).max(axis=0)
        # Arrow thickness relative to the larger of the two axis ranges after the scores were drawn.
        x_low, x_high = axs.get_xlim()
        y_low, y_high = axs.get_ylim()
        width = 0.003 * max(x_high - x_low, y_high - y_low)
        for tip, variable in zip(arrows, self.X.columns):
            axs.arrow(0, 0, *tip, color='k', alpha=0.5, width = width, ec='none',
                      length_includes_head=True)
            axs.text(*(tip * 1.05), variable,
                     ha='center', va='center', fontsize = 7)

        self._label_axes(axs)
        return axs

    def scatter_plot(self, axs = None):
        """
        Draw the scores only (no loading arrows).

        - ``axs``: matplotlib axes to draw on. If ``None``, a new 8x6 figure is created
        Returns the axes. The axes is also made the current pyplot axes.
        """
        if axs is None:
            _, axs = plt.subplots(figsize=(8,6))
        plt.sca(axs)

        self._draw_scores(axs)
        self._label_axes(axs)
        return axs
\ No newline at end of file
import pandas as pd
import numpy as np
from typing import Literal
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.patches as mpatches
from sklearn.cross_decomposition import PLSCanonical
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects import numpy2ri
import rpy2.robjects as ro
# Module-level side effect: the rpy2 pandas and numpy converters are switched on when this module is imported.
# NOTE(review): recent rpy2 releases deprecate ``activate()`` in favour of conversion contexts (as used in
# ``AutoPLS.__init__``) - confirm against the rpy2 version this project pins.
pandas2ri.activate()
numpy2ri.activate()
class AutoPLS():
    """
    Allows to perform and plot a two-component canonical PLS (symmetrical design), either with
    sklearn's PLSCanonical or with the sparse PLS of the R package mixOmics.

    Input:
    - ``data_x``: ``DataFrame`` of numeric data with the index being the sample tag
    - ``data_y``: ``DataFrame`` of numeric data with the index being the sample tag
    - ``methode``: ``'sklearn'`` (default) or ``'mixOmcis'`` (the correctly spelled ``'mixOmics'`` is accepted as well)
    - ``keepX`` / ``keepY``: number of variables kept per component, only used by mixOmics
    - ``use``: ``'Loadings'`` (default) or ``'Rotations'``, selects what ``loadings_plot`` draws

    Columns containing missing values are removed from both data sets before fitting.
    The plots expect sample tags of the form ``<genotype>_<condition>``.

    Note:
    - ``scores``: T
    - ``weights``: W used in the calculation of the PLS
    - ``loadings``: P calculated after the PLS so that X = T @ P.T
    - ``rotations``: R calculated by R = W @ (P.T @ W)^-1 and results in T = X @ R
    - ``captured_variance_x`` / ``captured_variance_y``: share of the (standardised) data that is
      reproduced by T @ P.T
    """

    # Palettes are cycled, so any number of genotypes / conditions can be drawn.
    _COLOR_CYCLE = ('#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf')
    _MARKER_CYCLE = ('o', '^', 's', 'p', 'D', 'v', '<', '>')

    # ``keepX`` / ``keepY`` defaults are only read, never modified.
    def __init__(self, data_x: pd.DataFrame, data_y: pd.DataFrame, methode: Literal['sklearn', 'mixOmcis', 'mixOmics'] = 'sklearn', keepX: list = [5, 5], keepY: list = [5,5],
                 use: Literal['Loadings', 'Rotations']= 'Loadings') -> None:
        # Fail early with a clear message instead of a NameError further down.
        if methode not in ('sklearn', 'mixOmcis', 'mixOmics'):
            raise ValueError(f"methode must be 'sklearn' or 'mixOmcis', got {methode!r}")

        self.data_x = data_x.dropna(axis=1)
        self.data_y = data_y.dropna(axis=1)
        self.use = use

        if methode == 'sklearn':
            self.pls = PLSCanonical(n_components=2)
            self.pls.fit(self.data_x, self.data_y)
            self.x_scores, self.y_scores = self.pls.transform(self.data_x, self.data_y)
            self.x_loadings_ = self.pls.x_loadings_
            self.y_loadings_ = self.pls.y_loadings_
            self.x_rotations_ = self.pls.x_rotations_
            self.y_rotations_ = self.pls.y_rotations_
            # Reference data for the captured variance, standardised the same way the estimator does it.
            raw_x = self._standardise(self.data_x)
            raw_y = self._standardise(self.data_y)
        else:
            mixOmics = importr('mixOmics')
            with (ro.default_converter + pandas2ri.converter).context():
                Xr = ro.conversion.get_conversion().py2rpy(self.data_x.copy())
                Yr = ro.conversion.get_conversion().py2rpy(self.data_y.copy())
                pls = mixOmics.spls(Xr, Yr, ncomp = 2, mode = 'canonical',
                                    keepX = ro.vectors.IntVector(keepX), keepY = ro.vectors.IntVector(keepY))
                self.x_loadings_ = pls['loadings']['X']
                self.y_loadings_ = pls['loadings']['Y']
                # The entries of 'loadings.star' are picked by position (first X, then Y).
                rotations = list(pls['loadings.star'].items())
                self.x_rotations_ = rotations[0][1]
                self.y_rotations_ = rotations[1][1]
                self.x_scores = pls['variates']['X']
                self.y_scores = pls['variates']['Y']
                # presumably the preprocessed data as used by mixOmics - verify against the mixOmics docs
                raw_x = pls['X']
                raw_y = pls['Y']

        self.captured_variance_x = self._captured_variance(self.x_scores, self.x_loadings_, raw_x)
        self.captured_variance_y = self._captured_variance(self.y_scores, self.y_loadings_, raw_y)

    @staticmethod
    def _standardise(frame):
        """Centre and scale every column (sample standard deviation, constant columns are left unscaled)."""
        # float copy: in-place arithmetic on an integer array would raise
        values = np.array(frame, dtype=float)
        values -= values.mean(axis=0)
        std = values.std(axis=0, ddof=1)
        std[std == 0.0] = 1.0
        return values / std

    @staticmethod
    def _captured_variance(scores, loadings, reference):
        """Return sum((T @ P.T)^2) / sum(reference^2), i.e. trace(R.T @ R) / trace(X.T @ X) without the large products."""
        rebuilt = np.asarray(scores) @ np.asarray(loadings).T
        return np.sum(np.square(rebuilt)) / np.sum(np.square(np.asarray(reference)))

    @staticmethod
    def _style_map(levels, palette):
        """Assign one palette entry to every level, starting over once the palette is exhausted."""
        return {level: palette[i % len(palette)] for i, level in enumerate(levels)}

    @staticmethod
    def _arrow_width(points, fraction = 0.005):
        """Return an arrow thickness of ``fraction`` times the larger coordinate range of ``points``."""
        points = np.asarray(points, dtype=float)
        span = (points.max(axis=0) - points.min(axis=0)).max()
        return fraction * (span if span > 0 else 1.0)

    @staticmethod
    def _score_table(scores, tags):
        """Build a frame with ``Comp1``/``Comp2`` and the ``genotype``/``condition`` parts of the sample tags."""
        table = pd.DataFrame(np.asarray(scores), columns=['Comp1', 'Comp2'])
        tag_series = pd.Series(list(tags), index=table.index)
        table[['genotype', 'condition']] = tag_series.str.split('_', expand=True)
        return table

    def return_loadings(self):
        """Return the X and Y loadings as two ``DataFrame`` with columns ``Comp1``, ``Comp2`` and ``variable``."""
        x_loadings = pd.DataFrame(np.array(self.x_loadings_), columns = ['Comp1', 'Comp2'])
        x_loadings['variable'] = self.data_x.columns
        y_loadings = pd.DataFrame(np.array(self.y_loadings_), columns = ['Comp1', 'Comp2'])
        y_loadings['variable'] = self.data_y.columns
        return x_loadings, y_loadings

    def arrow_plot(self, axs, set_name: list):
        """
        Draw the X scores (black edge) and Y scores (gray edge) of every sample and connect the
        matching genotype/condition pairs with an arrow pointing from X to Y.

        - ``axs``: matplotlib axes to draw on. If ``None``, a new 8x6 figure is created
        - ``set_name``: names of the two data sets (X first), used for the legend
        Returns the axes. The axes is also made the current pyplot axes.
        """
        if axs is None:
            _, axs = plt.subplots(figsize=(8,6))
        plt.sca(axs)

        res_x = self._score_table(self.x_scores, self.data_x.index)
        res_y = self._score_table(self.y_scores, self.data_y.index)

        cols = res_x['genotype'].drop_duplicates().to_list()
        colors = self._style_map(cols, self._COLOR_CYCLE)
        shps = res_x['condition'].drop_duplicates().to_list()
        shapes = self._style_map(shps, self._MARKER_CYCLE)
        set_color = ['black', 'gray']

        # The thickness is derived from the data that is about to be drawn; the axis limits are not
        # meaningful yet at this point (a fresh axes always spans 0..1).
        components = ['Comp1', 'Comp2']
        width = self._arrow_width(np.vstack([res_x[components].to_numpy(), res_y[components].to_numpy()]))

        for col in cols:
            for shp in shps:
                sel_x = res_x[(res_x['genotype'] == col) & (res_x['condition'] == shp)]
                sel_y = res_y[(res_y['genotype'] == col) & (res_y['condition'] == shp)]
                # Not every genotype/condition combination has to be present in both data sets.
                if not sel_x.empty and not sel_y.empty:
                    # The arrow connects the first sample of the combination in X and Y.
                    start = sel_x[components].to_numpy()[0]
                    end = sel_y[components].to_numpy()[0]
                    # 0.94: stop short of the target, presumably so the head does not cover the marker
                    xd_arrow, yd_arrow = (end - start) * 0.94
                    axs.arrow(start[0], start[1], xd_arrow, yd_arrow, width = width, ec='none',
                              length_includes_head=True, color = colors[col])
                if not sel_x.empty:
                    axs.scatter(sel_x['Comp1'], sel_x['Comp2'],
                                marker = shapes[shp], color = colors[col], edgecolors='black')
                if not sel_y.empty:
                    axs.scatter(sel_y['Comp1'], sel_y['Comp2'],
                                marker = shapes[shp], color = colors[col], edgecolors='gray')

        axs.set_xticks([])
        axs.set_yticks([])
        axs.set_xlabel('Comp 1')
        axs.set_ylabel('Comp 2')

        color_handles = [Line2D([0], [0], marker = 's', markersize=8, linestyle='None', label = name, color=color)
                         for name, color in colors.items()]
        shape_handles = [Line2D([0], [0], marker = marker, markersize=8, linestyle='None', label = name)
                         for name, marker in shapes.items()]
        set_handles = [Line2D([0], [0], marker = 's', markersize=8, linestyle='None', label = name, color = 'white',
                              markeredgecolor=set_color[i % len(set_color)])
                       for i, name in enumerate(set_name)]
        color_legend = axs.legend(handles = color_handles, loc = 'lower left', ncols = max(len(cols), 1),
                                  bbox_to_anchor=(0.25, -0.13), frameon=False, prop={'size': 9})
        set_legend = axs.legend(handles = set_handles, loc = 'lower left', ncols = 1,
                                bbox_to_anchor = (-0.035,-0.17), frameon=False, prop={'size': 9})
        axs.legend(handles=shape_handles, loc = 'lower left', ncols = max(len(shps), 1),
                   bbox_to_anchor=(0.25, -0.17), frameon=False, prop={'size': 9})
        # Every call to ``legend`` replaces the previous one, so the first two are re-added.
        axs.add_artist(color_legend)
        axs.add_artist(set_legend)
        return axs

    def loadings_plot(self, axs, set_name: list):
        """
        Draw one arrow per variable of X (blue) and Y (orange), using the loadings or the rotations
        depending on ``use``.

        - ``axs``: matplotlib axes to draw on. If ``None``, a new 8x6 figure is created
        - ``set_name``: names of the two data sets (X first), used for the legend
        Returns the axes. The axes is also made the current pyplot axes.
        Raises ``ValueError`` if ``use`` is neither ``'Loadings'`` nor ``'Rotations'``.
        """
        if self.use == 'Loadings':
            x_vectors, y_vectors = self.x_loadings_, self.y_loadings_
        elif self.use == 'Rotations':
            x_vectors, y_vectors = self.x_rotations_, self.y_rotations_
        else:
            raise ValueError(f"use must be 'Loadings' or 'Rotations', got {self.use!r}")

        if axs is None:
            _, axs = plt.subplots(figsize=(8,6))
        plt.sca(axs)

        # Stretch the vectors to the largest absolute score (over X and Y) of each component.
        arrow_scaler = np.maximum(np.abs(np.asarray(self.x_scores)).max(axis=0),
                                  np.abs(np.asarray(self.y_scores)).max(axis=0))
        x_tips = np.asarray(x_vectors, dtype=float) * arrow_scaler
        y_tips = np.asarray(y_vectors, dtype=float) * arrow_scaler

        # The thickness is derived from the arrows themselves (origin included); the axis limits are
        # not meaningful before anything is drawn.
        width = self._arrow_width(np.vstack([x_tips, y_tips, np.zeros((1, 2))]))

        # Tips and names are paired by position, which also stays correct for duplicate column names.
        for tips, names, color in ((x_tips, self.data_x.columns.tolist(), '#1f77b4'),
                                   (y_tips, self.data_y.columns.tolist(), '#ff7f0e')):
            for tip, name in zip(tips, names):
                axs.arrow(0, 0, *tip, color = color, ec = 'none', width = width)
                axs.text(*(tip * 1.05), name, ha='center', va='center', fontsize = 7)

        x_patch = mpatches.Patch(color='#1f77b4', label=f'{set_name[0]}')
        y_patch = mpatches.Patch(color='#ff7f0e', label=f'{set_name[1]}')
        axs.legend(handles=[x_patch, y_patch], loc = 'lower center', ncols = 2,
                   bbox_to_anchor=(0.5, -0.13), frameon=False)

        axs.set_xticks([])
        axs.set_yticks([])
        axs.set_xlabel('Comp 1')
        axs.set_ylabel('Comp 2')
        return axs
\ No newline at end of file
This diff is collapsed.
# Names exported by a star-import of this package.
__all__ = [
    'Omics_API',
    'get_omics_data',
    'dynamics_by_anova',
    'AutoPCA',
]
\ No newline at end of file
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment