<aside>
NCBI GEO: GSE211799 (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE211799)
사용한 public data </aside>
각 분석 기법별로 필요한 input data 가공은 GitHub의 ^preprocess.py/.ipynb 참고
adata = sc.read_h5ad('C:/Users/user/Desktop/GSE211799_adata_atlas.h5ad')
>>> print(adata.shape)
(301796, 31706)
>>> print(adata)
AnnData object with n_obs × n_vars = 301796 × 31706
obs: 'study_sample', 'study', 'file', 'reference', 'size_factors_sample', 'phase_cyclone', 's_cyclone', 'g2m_cyclone', 'g1_cyclone', 'sex', 'ins_score', 'ins_high', 'gcg_score', 'gcg_high', 'sst_score', 'sst_high', 'ppy_score', 'ppy_high', 'cell_filtering', 'age', 'strain', 'tissue', 'technique', 'study_sample_design', 'cell_type', 'cell_type_multiplet', 'cell_subtype', 'cell_subtype_multiplet', 'design', 'size_factors_integrated', 'pre_cell_type_unified', 'pre_cell_type_original', 'study_parsed', 'cell_type_parsed', 'low_q', 'BETA-DATA_leiden_r1.5', 'BETA-DATA_leiden_r20', 'BETA-DATA_hc_gene_programs', 'BETA-DATA_hc_gene_programs_parsed', 'BETA-DATA_leiden_r1.5_parsed', 'BETA-DATA_leiden_r1.5_parsed_const', 'CXG-DATA_n_genes', 'CXG-DATA_mt_frac', 'CXG-DATA_doublet_score', 'CXG-DATA_log10_n_counts', 'CXG-DATA_age_approxDays', 'CXG-DATA_cell_subtype_immune_reannotatedIntegrated', 'CXG-DATA_cell_subtype_endothelial_reannotatedIntegrated', 'CXG-DATA_emptyDrops_LogProb_scaled', 'CXG-DATA_diabetes_model', 'CXG-DATA_chemical_stress', 'CXG-DATA_GEO_accession', 'CXG-DATA_sex_annotation', 'cell_type_integrated_v2', 'cell_type_integrated_v2_parsed'
var: 'gene_symbol', 'used_integration', 'gene_symbol_original_matched', 'CXG-DATA_feature_is_filtered', 'CXG-DATA_present_Fltp_2y', 'CXG-DATA_present_Fltp_adult', 'CXG-DATA_present_Fltp_P16', 'CXG-DATA_present_NOD', 'CXG-DATA_present_NOD_elimination', 'CXG-DATA_present_spikein_drug', 'CXG-DATA_present_embryo', 'CXG-DATA_present_VSG', 'CXG-DATA_present_STZ', 'gene_symbol_FINAL'
uns: 'BETA-DATA_hc_gene_programs_parsed_colors', 'BETA-DATA_hc_gene_programs_parsed_order', 'BETA-DATA_leiden_r1.5_parsed_colors', 'BETA-DATA_leiden_r1.5_parsed_order', 'cell_type_integrated_v2_colors', 'cell_type_integrated_v2_parsed_order', 'field_descriptions', 'study_colors', 'study_order', 'study_parsed_colors', 'study_parsed_order'
obsm: 'BETA-DATA_X_umap', 'BETA-DATA_X_umap_opt', 'X_integrated', 'X_umap'
>>> print(adata.obs.head())
study_sample study file ... CXG-DATA_sex_annotation cell_type_integrated_v2 cell_type_integrated_v2_parsed
index ...
CAAGATCGTCCAGTTA-1-SRR7610301-NOD_elimination NOD_elimination_SRR7610301 NOD_elimination SRR7610301 ... ground-truth ductal ductal
GATGAAAGTTGTCGCG-1-SRR7610298-NOD_elimination NOD_elimination_SRR7610298 NOD_elimination SRR7610298 ... ground-truth gamma gamma
AGTCTTTAGGAGCGTT-1-SRR7610301-NOD_elimination NOD_elimination_SRR7610301 NOD_elimination SRR7610301 ... ground-truth endothelial endothelial
CTTCCTTGTACCCAGC-1-MUC13640-VSG VSG_MUC13640 VSG MUC13640 ... ground-truth immune immune
CATCAAGAGATTACCC-1-SRR7610296-NOD_elimination NOD_elimination_SRR7610296 NOD_elimination SRR7610296 ... ground-truth stellate_quiescent stellate q.
[5 rows x 55 columns]
>>> print(adata.var.head())
gene_symbol used_integration gene_symbol_original_matched CXG-DATA_feature_is_filtered ... CXG-DATA_present_embryo CXG-DATA_present_VSG CXG-DATA_present_STZ gene_symbol_FINAL
EID ...
ENSMUSG00000000001 Gnai3 False Gnai3 False ... True True True Gnai3
ENSMUSG00000000003 Pbsn False Pbsn False ... True True True Pbsn
ENSMUSG00000000028 Cdc45 False Cdc45 False ... True True True Cdc45
ENSMUSG00000000031 H19 False H19 False ... True True True H19
ENSMUSG00000000037 Scml2 False Scml2 False ... True True True Scml2
[5 rows x 14 columns]
>>> print(adata.uns.keys())
dict_keys(['BETA-DATA_hc_gene_programs_parsed_colors', 'BETA-DATA_hc_gene_programs_parsed_order', 'BETA-DATA_leiden_r1.5_parsed_colors', 'BETA-DATA_leiden_r1.5_parsed_order', 'cell_type_integrated_v2_colors', 'cell_type_integrated_v2_parsed_order', 'field_descriptions', 'study_colors', 'study_order', 'study_parsed_colors', 'study_parsed_order'])
>>> print(adata.obsm.keys())
KeysView(AxisArrays with keys: BETA-DATA_X_umap, BETA-DATA_X_umap_opt, X_integrated, X_umap)
>>> print(adata.X[:5, :5])
<Compressed Sparse Row sparse matrix of dtype 'float64'
with 1 stored elements and shape (5, 5)>
Coords Values
(0, 0) 0.7450058814879538
>>> print(adata.obs.keys())
Index(['study_sample', 'study', 'file', 'reference', 'size_factors_sample',
'phase_cyclone', 's_cyclone', 'g2m_cyclone', 'g1_cyclone', 'sex',
'ins_score', 'ins_high', 'gcg_score', 'gcg_high', 'sst_score',
'sst_high', 'ppy_score', 'ppy_high', 'cell_filtering', 'age', 'strain',
'tissue', 'technique', 'study_sample_design', 'cell_type',
'cell_type_multiplet', 'cell_subtype', 'cell_subtype_multiplet',
'design', 'size_factors_integrated', 'pre_cell_type_unified',
'pre_cell_type_original', 'study_parsed', 'cell_type_parsed', 'low_q',
'BETA-DATA_leiden_r1.5', 'BETA-DATA_leiden_r20',
'BETA-DATA_hc_gene_programs', 'BETA-DATA_hc_gene_programs_parsed',
'BETA-DATA_leiden_r1.5_parsed', 'BETA-DATA_leiden_r1.5_parsed_const',
'CXG-DATA_n_genes', 'CXG-DATA_mt_frac', 'CXG-DATA_doublet_score',
'CXG-DATA_log10_n_counts', 'CXG-DATA_age_approxDays',
'CXG-DATA_cell_subtype_immune_reannotatedIntegrated',
'CXG-DATA_cell_subtype_endothelial_reannotatedIntegrated',
'CXG-DATA_emptyDrops_LogProb_scaled', 'CXG-DATA_diabetes_model',
'CXG-DATA_chemical_stress', 'CXG-DATA_GEO_accession',
'CXG-DATA_sex_annotation', 'cell_type_integrated_v2',
'cell_type_integrated_v2_parsed'],
dtype='object')
>>> print(adata.obs.cell_type_integrated_v2_parsed.keys())
Index(['CAAGATCGTCCAGTTA-1-SRR7610301-NOD_elimination',
'GATGAAAGTTGTCGCG-1-SRR7610298-NOD_elimination',
'AGTCTTTAGGAGCGTT-1-SRR7610301-NOD_elimination',
'CTTCCTTGTACCCAGC-1-MUC13640-VSG',
'CATCAAGAGATTACCC-1-SRR7610296-NOD_elimination',
'CAGAGAGCAACGATGG-1-G2-STZ', 'GTGCTTCCATTGCCTC-1-MUC13631-VSG',
'ATAGGCTCATGCAGCC-1-MUC13639-VSG', 'CCCTCCTCAGCGTCCA-1-G5-STZ',
'TACCTGCAGGAAAGGT-1-MUC13640-VSG',
...
'ATGAGGGCATTCCTCG-1-MUC13632-VSG',
'ACGCCGACACACATGT-1-mouse2-Fltp_adult',
'CACCAGGTCGAGGTAG-1-SRR7610302-NOD_elimination',
'TGCACCTCAGATCTGT-1-E15_5-embryo', 'TAAACCGAGACGACGT-1-G8-STZ',
'GACTAACGTGAGGCTA-1-SRR10751514-spikein_drug',
'CAGCAGCGTGTGCGTC-1-E14_5-embryo',
'TCGAGGCGTTGGTTTG-1-mouse2-Fltp_adult',
'CTCATTAGTAGGGTAC-1-SRR10751508-spikein_drug',
'CTTGGCTAGACAGACC-1-SRR7610302-NOD_elimination'],
dtype='object', name='index', length=301796)
>>> print(adata.obs['cell_type_integrated_v2_parsed'].unique())
['ductal', 'gamma', 'endothelial', 'immune', 'stellate q.', ..., 'alpha+beta', 'beta+gamma', 'delta+gamma', 'acinar', 'schwann']
Length: 20
Categories (20, object): ['E endo.' < 'E non-endo.' < 'alpha' < 'beta' ... 'beta+delta' <
'beta+gamma' < 'delta+gamma' < 'lowQ']