g1kv37 vs hg19
In order to create a class to translate the chromosome names from one naming convention to another, I've compared the MD5 sums of the human genome versions g1k/v37 and ucsc/hg19. Here is the Java program to create the MD5s:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
import java.security.MessageDigest; | |
/**
 * Reads FASTA records from standard input and prints one line per record to
 * standard output:
 *
 * <pre>header TAB md5 TAB length</pre>
 *
 * The MD5 is computed over the upper-cased sequence characters only; spaces,
 * tabs, carriage returns and line feeds inside the sequence are ignored, so
 * two files that differ only in line wrapping or soft-masking (lower case)
 * yield the same checksum. MD5 is used here purely as a content fingerprint,
 * not for any security purpose.
 */
public class FastaMD5
{
    /**
     * Program entry point. Command-line arguments are ignored.
     *
     * @param args unused
     * @throws IOException if sequence data is found before the first
     *         {@code '>'} header line, or if reading standard input fails
     * @throws Exception if the MD5 algorithm is unavailable
     */
    public static void main(String args[]) throws Exception
    {
        // Explicit buffering: the input is read one byte at a time.
        InputStream in = new BufferedInputStream(System.in);
        PrintStream out = System.out;

        long len = 0;                      // residues seen in the current record
        byte[] buffer = new byte[1];       // one-byte scratch area for digest updates
        MessageDigest complete = null;     // null until the first header has been read

        for (;;)
        {
            int c = in.read();
            switch (c)
            {
                case -1: case '>':
                {
                    // End of input or start of a new record: flush the previous one.
                    if (complete != null)
                    {
                        out.print(toHex(complete.digest()));
                        out.println("\t" + len);
                        complete = null;
                        len = 0;
                    }
                    if (c == -1)
                    {
                        out.flush();
                        return;
                    }
                    // Echo the header line (without the leading '>'). Carriage
                    // returns are dropped so CRLF files do not leak a '\r'
                    // into the first output column.
                    while ((c = in.read()) != -1 && c != '\n')
                    {
                        if (c != '\r')
                        {
                            out.print((char) c);
                        }
                    }
                    out.print('\t');
                    complete = MessageDigest.getInstance("MD5");
                    len = 0;
                    break;
                }
                // Whitespace inside the sequence is not part of the sequence.
                case '\n': case ' ': case '\r': case '\t':
                    break;
                default:
                {
                    if (complete == null)
                    {
                        // Previously this surfaced as an opaque NullPointerException.
                        throw new IOException(
                            "Malformed FASTA: sequence data found before the first '>' header");
                    }
                    buffer[0] = (byte) Character.toUpperCase(c);
                    complete.update(buffer, 0, 1);
                    ++len;
                    break;
                }
            }
        }
    }

    /** Formats a digest as lower-case hexadecimal, two characters per byte. */
    private static String toHex(byte[] digest)
    {
        StringBuilder hex = new StringBuilder(digest.length * 2);
        for (byte b : digest)
        {
            hex.append(Character.forDigit((b >> 4) & 0xf, 16));
            hex.append(Character.forDigit(b & 0xf, 16));
        }
        return hex.toString();
    }
}
The MD5 sums were extracted as follows:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stream the 1000 Genomes (g1k v37) reference, decompress it on the fly and
# write one FastaMD5 line per sequence to a.txt.
$ curl -s "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz" | gunzip -c | java FastaMD5 > a.txt | |
# Same for UCSC hg19: the tarball members are extracted to stdout (tar O) and
# concatenated into a single FASTA stream; the result goes to b.txt.
$ curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz" | gunzip -c | tar Oxvf - 2> /dev/null | java FastaMD5 > b.txt | |
##join | |
# Pair the lines of a.txt and b.txt on column 2 (both inputs are sorted on
# that column first, as join requires), keep columns 1, 2 and 4 of the joined
# output and sort on the third remaining column.
# NOTE(review): the -t / -d delimiter is shown as a space here, but FastaMD5
# separates its columns with tabs, so presumably a literal tab was intended --
# confirm before reusing these commands.
$ join -t ' ' -1 2 -2 2 <(sort -t ' ' -k2,2 a.txt ) <(sort -t ' ' -k2,2 b.txt ) | cut -d ' ' -f 1,2,4 | sort -t ' ' -k3,3 | |
#unjoinable | |
# -v 1 -v 2 prints only the lines of either file that have no partner in the
# other file, i.e. the sequences whose checksum is not shared.
$ join -t ' ' -1 2 -2 2 -v 1 -v 2 <(sort -t ' ' -k2,2 a.txt ) <(sort -t ' ' -k2,2 b.txt ) | sort -t ' ' -k2,2 |
Here are the common chromosomes, joined on the hash-sum:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1b22b98cdeb4a9304cb5d48026a85128 | 1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 | chr1 | |
---|---|---|---|
988c28e000e84c26d552359af1ea2e1d | 10 dna:chromosome chromosome:GRCh37:10:1:135534747:1 | chr10 | |
98c59049a2df285c76ffb1c6db8f8b96 | 11 dna:chromosome chromosome:GRCh37:11:1:135006516:1 | chr11 | |
06cbf126247d89664a4faebad130fe9c | GL000202.1 dna:supercontig supercontig::GL000202.1:1:40103:1 | chr11_gl000202_random | |
51851ac0e1a115847ad36449b0015864 | 12 dna:chromosome chromosome:GRCh37:12:1:133851895:1 | chr12 | |
283f8d7892baa81b510a015719ca7b0b | 13 dna:chromosome chromosome:GRCh37:13:1:115169878:1 | chr13 | |
98f3cae32b2a2e9524bc19813927542e | 14 dna:chromosome chromosome:GRCh37:14:1:107349540:1 | chr14 | |
e5645a794a8238215b2cd77acb95a078 | 15 dna:chromosome chromosome:GRCh37:15:1:102531392:1 | chr15 | |
fc9b1a7b42b97a864f56b348b06095e6 | 16 dna:chromosome chromosome:GRCh37:16:1:90354753:1 | chr16 | |
351f64d4f4f9ddd45b35336ad97aa6de | 17 dna:chromosome chromosome:GRCh37:17:1:81195210:1 | chr17 | |
96358c325fe0e70bee73436e8bb14dbd | GL000203.1 dna:supercontig supercontig::GL000203.1:1:37498:1 | chr17_gl000203_random | |
efc49c871536fa8d79cb0a06fa739722 | GL000204.1 dna:supercontig supercontig::GL000204.1:1:81310:1 | chr17_gl000204_random | |
d22441398d99caf673e9afb9a1908ec5 | GL000205.1 dna:supercontig supercontig::GL000205.1:1:174588:1 | chr17_gl000205_random | |
43f69e423533e948bfae5ce1d45bd3f1 | GL000206.1 dna:supercontig supercontig::GL000206.1:1:41001:1 | chr17_gl000206_random | |
b15d4b2d29dde9d3e4f93d1d0f2cbc9c | 18 dna:chromosome chromosome:GRCh37:18:1:78077248:1 | chr18 | |
f3814841f1939d3ca19072d9e89f3fd7 | GL000207.1 dna:supercontig supercontig::GL000207.1:1:4262:1 | chr18_gl000207_random | |
1aacd71f30db8e561810913e0b72636d | 19 dna:chromosome chromosome:GRCh37:19:1:59128983:1 | chr19 | |
aa81be49bf3fe63a79bdc6a6f279abf6 | GL000208.1 dna:supercontig supercontig::GL000208.1:1:92689:1 | chr19_gl000208_random | |
f40598e2a5a6b26e84a3775e0d1e2c81 | GL000209.1 dna:supercontig supercontig::GL000209.1:1:159169:1 | chr19_gl000209_random | |
d75b436f50a8214ee9c2a51d30b2c2cc | GL000191.1 dna:supercontig supercontig::GL000191.1:1:106433:1 | chr1_gl000191_random | |
325ba9e808f669dfeee210fdd7b470ac | GL000192.1 dna:supercontig supercontig::GL000192.1:1:547496:1 | chr1_gl000192_random | |
a0d9851da00400dec1098a9255ac712e | 2 dna:chromosome chromosome:GRCh37:2:1:243199373:1 | chr2 | |
0dec9660ec1efaaf33281c0d5ea2560f | 20 dna:chromosome chromosome:GRCh37:20:1:63025520:1 | chr20 | |
2979a6085bfe28e3ad6f552f361ed74d | 21 dna:chromosome chromosome:GRCh37:21:1:48129895:1 | chr21 | |
851106a74238044126131ce2a8e5847c | GL000210.1 dna:supercontig supercontig::GL000210.1:1:27682:1 | chr21_gl000210_random | |
a718acaa6135fdca8357d5bfe94211dd | 22 dna:chromosome chromosome:GRCh37:22:1:51304566:1 | chr22 | |
23dccd106897542ad87d2765d28a19a1 | 4 dna:chromosome chromosome:GRCh37:4:1:191154276:1 | chr4 | |
dbb6e8ece0b5de29da56601613007c2a | GL000193.1 dna:supercontig supercontig::GL000193.1:1:189789:1 | chr4_gl000193_random | |
6ac8f815bf8e845bb3031b73f812c012 | GL000194.1 dna:supercontig supercontig::GL000194.1:1:191469:1 | chr4_gl000194_random | |
0740173db9ffd264d728f32784845cd7 | 5 dna:chromosome chromosome:GRCh37:5:1:180915260:1 | chr5 | |
1d3a93a248d92a729ee764823acbbc6b | 6 dna:chromosome chromosome:GRCh37:6:1:171115067:1 | chr6 | |
618366e953d6aaad97dbe4777c29375e | 7 dna:chromosome chromosome:GRCh37:7:1:159138663:1 | chr7 | |
5d9ec007868d517e73543b005ba48535 | GL000195.1 dna:supercontig supercontig::GL000195.1:1:182896:1 | chr7_gl000195_random | |
96f514a9929e410c6651697bded59aec | 8 dna:chromosome chromosome:GRCh37:8:1:146364022:1 | chr8 | |
d92206d1bb4c3b4019c43c0875c06dc0 | GL000196.1 dna:supercontig supercontig::GL000196.1:1:38914:1 | chr8_gl000196_random | |
6f5efdd36643a9b8c8ccad6f2f1edc7b | GL000197.1 dna:supercontig supercontig::GL000197.1:1:37175:1 | chr8_gl000197_random | |
3e273117f15e0a400f01055d9f393768 | 9 dna:chromosome chromosome:GRCh37:9:1:141213431:1 | chr9 | |
868e7784040da90d900d2d1b667a1383 | GL000198.1 dna:supercontig supercontig::GL000198.1:1:90085:1 | chr9_gl000198_random | |
569af3b73522fab4b40995ae4944e78e | GL000199.1 dna:supercontig supercontig::GL000199.1:1:169874:1 | chr9_gl000199_random | |
75e4c8d17cd4addf3917d1703cacaf25 | GL000200.1 dna:supercontig supercontig::GL000200.1:1:187035:1 | chr9_gl000200_random | |
dfb7e7ec60ffdcb85cb359ea28454ee9 | GL000201.1 dna:supercontig supercontig::GL000201.1:1:36148:1 | chr9_gl000201_random | |
7daaa45c66b288847b9b32b964e623d3 | GL000211.1 dna:supercontig supercontig::GL000211.1:1:166566:1 | chrUn_gl000211 | |
563531689f3dbd691331fd6c5730a88b | GL000212.1 dna:supercontig supercontig::GL000212.1:1:186858:1 | chrUn_gl000212 | |
9d424fdcc98866650b58f004080a992a | GL000213.1 dna:supercontig supercontig::GL000213.1:1:164239:1 | chrUn_gl000213 | |
46c2032c37f2ed899eb41c0473319a69 | GL000214.1 dna:supercontig supercontig::GL000214.1:1:137718:1 | chrUn_gl000214 | |
5eb3b418480ae67a997957c909375a73 | GL000215.1 dna:supercontig supercontig::GL000215.1:1:172545:1 | chrUn_gl000215 | |
642a232d91c486ac339263820aef7fe0 | GL000216.1 dna:supercontig supercontig::GL000216.1:1:172294:1 | chrUn_gl000216 | |
6d243e18dea1945fb7f2517615b8f52e | GL000217.1 dna:supercontig supercontig::GL000217.1:1:172149:1 | chrUn_gl000217 | |
1d708b54644c26c7e01c2dad5426d38c | GL000218.1 dna:supercontig supercontig::GL000218.1:1:161147:1 | chrUn_gl000218 | |
f977edd13bac459cb2ed4a5457dba1b3 | GL000219.1 dna:supercontig supercontig::GL000219.1:1:179198:1 | chrUn_gl000219 | |
fc35de963c57bf7648429e6454f1c9db | GL000220.1 dna:supercontig supercontig::GL000220.1:1:161802:1 | chrUn_gl000220 | |
3238fb74ea87ae857f9c7508d315babb | GL000221.1 dna:supercontig supercontig::GL000221.1:1:155397:1 | chrUn_gl000221 | |
6fe9abac455169f50470f5a6b01d0f59 | GL000222.1 dna:supercontig supercontig::GL000222.1:1:186861:1 | chrUn_gl000222 | |
399dfa03bf32022ab52a846f7ca35b30 | GL000223.1 dna:supercontig supercontig::GL000223.1:1:180455:1 | chrUn_gl000223 | |
d5b2fc04f6b41b212a4198a07f450e20 | GL000224.1 dna:supercontig supercontig::GL000224.1:1:179693:1 | chrUn_gl000224 | |
63945c3e6962f28ffd469719a747e73c | GL000225.1 dna:supercontig supercontig::GL000225.1:1:211173:1 | chrUn_gl000225 | |
1c1b2cd1fccbc0a99b6a447fa24d1504 | GL000226.1 dna:supercontig supercontig::GL000226.1:1:15008:1 | chrUn_gl000226 | |
a4aead23f8053f2655e468bcc6ecdceb | GL000227.1 dna:supercontig supercontig::GL000227.1:1:128374:1 | chrUn_gl000227 | |
c5a17c97e2c1a0b6a9cc5a6b064b714f | GL000228.1 dna:supercontig supercontig::GL000228.1:1:129120:1 | chrUn_gl000228 | |
d0f40ec87de311d8e715b52e4c7062e1 | GL000229.1 dna:supercontig supercontig::GL000229.1:1:19913:1 | chrUn_gl000229 | |
b4eb71ee878d3706246b7c1dbef69299 | GL000230.1 dna:supercontig supercontig::GL000230.1:1:43691:1 | chrUn_gl000230 | |
ba8882ce3a1efa2080e5d29b956568a4 | GL000231.1 dna:supercontig supercontig::GL000231.1:1:27386:1 | chrUn_gl000231 | |
3e06b6741061ad93a8587531307057d8 | GL000232.1 dna:supercontig supercontig::GL000232.1:1:40652:1 | chrUn_gl000232 | |
7fed60298a8d62ff808b74b6ce820001 | GL000233.1 dna:supercontig supercontig::GL000233.1:1:45941:1 | chrUn_gl000233 | |
93f998536b61a56fd0ff47322a911d4b | GL000234.1 dna:supercontig supercontig::GL000234.1:1:40531:1 | chrUn_gl000234 | |
118a25ca210cfbcdfb6c2ebb249f9680 | GL000235.1 dna:supercontig supercontig::GL000235.1:1:34474:1 | chrUn_gl000235 | |
fdcd739913efa1fdc64b6c0cd7016779 | GL000236.1 dna:supercontig supercontig::GL000236.1:1:41934:1 | chrUn_gl000236 | |
e0c82e7751df73f4f6d0ed30cdc853c0 | GL000237.1 dna:supercontig supercontig::GL000237.1:1:45867:1 | chrUn_gl000237 | |
131b1efc3270cc838686b54e7c34b17b | GL000238.1 dna:supercontig supercontig::GL000238.1:1:39939:1 | chrUn_gl000238 | |
99795f15702caec4fa1c4e15f8a29c07 | GL000239.1 dna:supercontig supercontig::GL000239.1:1:33824:1 | chrUn_gl000239 | |
445a86173da9f237d7bcf41c6cb8cc62 | GL000240.1 dna:supercontig supercontig::GL000240.1:1:41933:1 | chrUn_gl000240 | |
ef4258cdc5a45c206cea8fc3e1d858cf | GL000241.1 dna:supercontig supercontig::GL000241.1:1:42152:1 | chrUn_gl000241 | |
2f8694fc47576bc81b5fe9e7de0ba49e | GL000242.1 dna:supercontig supercontig::GL000242.1:1:43523:1 | chrUn_gl000242 | |
cc34279a7e353136741c9fce79bc4396 | GL000243.1 dna:supercontig supercontig::GL000243.1:1:43341:1 | chrUn_gl000243 | |
0996b4475f353ca98bacb756ac479140 | GL000244.1 dna:supercontig supercontig::GL000244.1:1:39929:1 | chrUn_gl000244 | |
89bc61960f37d94abf0df2d481ada0ec | GL000245.1 dna:supercontig supercontig::GL000245.1:1:36651:1 | chrUn_gl000245 | |
e4afcd31912af9d9c2546acf1cb23af2 | GL000246.1 dna:supercontig supercontig::GL000246.1:1:38154:1 | chrUn_gl000246 | |
7de00226bb7df1c57276ca6baabafd15 | GL000247.1 dna:supercontig supercontig::GL000247.1:1:36422:1 | chrUn_gl000247 | |
5a8e43bec9be36c7b49c84d585107776 | GL000248.1 dna:supercontig supercontig::GL000248.1:1:39786:1 | chrUn_gl000248 | |
1d78abec37c15fe29a275eb08d5af236 | GL000249.1 dna:supercontig supercontig::GL000249.1:1:38502:1 | chrUn_gl000249 | |
7e0e2e580297b7764e31dbc80c2540dd | X dna:chromosome chromosome:GRCh37:X:1:155270560:1 | chrX |
And here are the unpairable data:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
d89517b400226d3b56e753972a7cad67 chr17_ctg5_hap1 1680828 | |
641e4338fa8d52a5b781bd2a2c08d3c3 chr3 198022430 | |
fa24f81b680df26bcfb6d69b784fbe36 chr4_ctg9_hap1 590426 | |
fe71bc63420d666884f37a3ad79f3317 chr6_apd_hap1 4622290 | |
18c17e1641ef04873b15f40f6c8659a4 chr6_cox_hap2 4795371 | |
2a3c677c426a10e137883ae1ffb8da3f chr6_dbb_hap3 4610396 | |
9d51d4152174461cd6715c7ddc588dc8 chr6_mann_hap4 4683263 | |
efed415dd8742349cb7aaca054675b9a chr6_mcf_hap5 4833398 | |
094d037050cad692b57ea12c4fef790f chr6_qbl_hap6 4611984 | |
3b6d666200e72bcc036bf88a4d7e0749 chr6_ssto_hap7 4928567 | |
d2ed829b8a1628d16cbeee88e88e39eb chrM 16571 | |
1e86411d73e6f00a10590f976be01623 chrY 59373566 | |
fdfd811849cc2fadebc929bb925902e5 3 dna:chromosome chromosome:GRCh37:3:1:198022430:1 198022430 | |
c68f52674c9fb33aef52dcf399755519 MT gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome 16569 | |
1fa3474750af0948bdf97d5a0ee52e51 Y dna:chromosome chromosome:GRCh37:Y:2649521:59034049:1 59373566 |
I knew about the problem with chrY (http://www.biostars.org/p/58143/) but not about chr3. What is the problem with this chromosome?
Edit: Here are the base counts for UCSC/chr3:
{T=58760485, G=38670110, A=58713343, C=38653197, N=3225295} and for g1kv37:
{T=58760485, G=38670110, A=58713343, R=2, C=38653197, M=1, N=3225292}
That's it,
Pierre.
2 comments:
Hello Pierre, about chr3: you helped me find an answer 2 years ago ;-)
http://www.biostars.org/p/9464/
My last comment on that page shows a possible answer.
@pablo Haha :-) I didn't remember that post :-)
Post a Comment