{"doi":"10.1214/10-aoas363","title":"Subsampling methods for genomic inference","abstract":"Large-scale statistical analysis of data sets associated with\\ngenome sequences plays an important role in modern biology. A\\nkey component of such statistical analyses is the computation of\\np-values and confidence bounds for statistics\\ndefined on the genome. Currently such computation is commonly\\nachieved through ad hoc simulation measures. The method of\\nrandomization, which is at the heart of these simulation\\nprocedures, can significantly affect the resulting statistical\\nconclusions. Most simulation schemes introduce a variety of\\nhidden assumptions regarding the nature of the randomness in the\\ndata, resulting in a failure to capture biologically meaningful\\nrelationships. To address the need for a method of assessing the\\nsignificance of observations within large scale genomic studies,\\nwhere there often exists a complex dependency structure between\\nobservations, we propose a unified solution built upon a data\\nsubsampling approach. We propose a piecewise stationary model\\nfor genome sequences and show that the subsampling approach\\ngives correct answers under this model. We illustrate the method\\non three simulation studies and two real data examples.","journal":"The Annals of Applied Statistics","year":2010,"id":1411,"datarank":4.549202113558515,"base_score":4.23410650459726,"endowment":4.23410650459726,"self_citation_contribution":0.635115975689589,"citation_network_contribution":3.9140861378689253,"self_endowment_contribution":0.635115975689589,"citer_contribution":3.9140861378689253,"corpus_percentile":null,"corpus_rank":null,"citation_count":68,"citer_count":64,"citers_with_citation_signal":62,"citers_with_endowment":62,"datacite_reuse_total":0,"is_dataset":false,"is_dataset_confidence":0.0442,"is_oa":true,"file_count":0,"downloads":0,"has_version_chain":false,"published_date":"2010-12-01","fair_score":61.25,"fair_percentile":92.70008795074759,"algorithm_id":"datarank_citation_only_1hop_v6","ranking_scope":"data_only","authors":[{"id":17024,"name":"Nathan Boley","orcid":"0000-0001-7114-2450","position":1,"is_corresponding":false},{"id":362,"name":"James B Brown","orcid":"0000-0002-5898-5848","position":2,"is_corresponding":false},{"id":17025,"name":"Haiyan Huang","orcid":"0009-0005-9761-8441","position":3,"is_corresponding":false},{"id":17026,"name":"Nancy R. Zhang","orcid":"0000-0002-0880-5749","position":4,"is_corresponding":false},{"id":17023,"name":"Peter J. Bickel","orcid":"0000-0001-7480-662X","position":0,"is_corresponding":true}],"reference_count":38,"raw_metadata":{"citation_network_status":"fetched"},"created_at":"2026-03-01T18:20:47.508186Z","pmid":null,"pmcid":null,"fwci":null,"citation_percentile":null,"influential_citations":0,"oa_status":null,"license":null,"views":0,"total_file_size_bytes":0,"version_count":0,"fair_f":100.0,"fair_a":70.0,"fair_i":50.0,"fair_r":25.0,"fair_zscore":1.4508,"fair_rationale":{"fair_score":61.25,"has_llm":false,"dimensions":{"F":{"name":"Findable","score":100.0,"criteria":[{"key":"f_has_doi","label":"Has a persistent DOI","kind":"deterministic","weight":1.0,"fraction":1.0,"signal":"DOI present","rationale":null},{"key":"f_repository_presence","label":"Indexed in repositories / literature DBs","kind":"deterministic","weight":1.0,"fraction":1.0,"signal":"datacite=1, pmcid=False, pmid=True","rationale":null},{"key":"f_persistent_ids","label":"Resolvable scholarly identifiers (OpenAlex)","kind":"deterministic","weight":0.5,"fraction":1.0,"signal":"OpenAlex id present","rationale":null}]},"A":{"name":"Accessible","score":70.0,"criteria":[{"key":"a_open_access","label":"Open Access / files deposited","kind":"deterministic","weight":1.5,"fraction":0.5,"signal":"files/OA location present but not flagged OA","rationale":null},{"key":"a_retrievable","label":"Free full text retrievable","kind":"deterministic","weight":1.0,"fraction":1.0,"signal":"11 OA location(s)","rationale":null}]},"I":{"name":"Interoperable","score":50.0,"criteria":[{"key":"i_linked_data","label":"Linked datasets / DataCite relations","kind":"deterministic","weight":1.0,"fraction":1.0,"signal":"linked_datasets=1, datacite=1","rationale":null},{"key":"i_standard_ids","label":"References data via standard accessions","kind":"deterministic","weight":1.0,"fraction":0.0,"signal":"accessions=0, trials=0","rationale":null}]},"R":{"name":"Reusable","score":25.0,"criteria":[{"key":"r_license","label":"Clear, open reuse license","kind":"deterministic","weight":1.5,"fraction":0.5,"signal":"license present (arXiv Non-Exclusive Distribution)","rationale":null},{"key":"r_downloads","label":"Demonstrated reuse (downloads)","kind":"deterministic","weight":0.5,"fraction":0.0,"signal":"downloads=0","rationale":null},{"key":"r_version","label":"Versioned / maintained","kind":"deterministic","weight":0.5,"fraction":0.0,"signal":"no version chain","rationale":null},{"key":"r_dataset","label":"Classified as a data resource","kind":"deterministic","weight":0.5,"fraction":0.0,"signal":"not a dataset","rationale":null}]}},"suggestions":["Reference data using standard accessions (e.g. GEO, PDB, ClinicalTrials.gov).","Maintain explicit versioning for the dataset.","Make the paper/data Open Access or deposit the files in an open repository.","Attach a clear, open reuse license (e.g. CC-BY or CC0)."],"model":null,"agent_version":"fair_agent_v1","fulltext_source":"oa_pdf"},"fair_model":null,"fair_agent_version":"fair_agent_v1","fair_fulltext_source":"oa_pdf","fair_has_llm":false,"fair_computed_at":"2026-06-11T05:31:41.679136Z","clinical_trials":[],"software_tools":[],"db_accessions":[],"linked_datasets":[],"topics":[]}