g1kv37 vs hg19
In order to create a class that translates chromosome names from one naming convention to another, I've compared the MD5 sums of the sequences in the human genome versions g1k/v37 and ucsc/hg19. Here is the Java program that computes the MD5 sums:
import java.io.*; | |
import java.security.MessageDigest; | |
/**
 * Reads FASTA records from standard input and prints one line per record to
 * standard output: {@code header TAB md5 TAB length}.
 *
 * <p>The MD5 digest is computed over the sequence characters after upper-casing
 * them; newlines, carriage returns and spaces inside the sequence are skipped and
 * are not counted in the length.
 */
public class FastaMD5
{
    /**
     * Streams standard input and emits the per-record summary lines.
     *
     * @param args ignored
     * @throws IOException if sequence data appears before the first {@code '>'}
     *         header, or if reading standard input fails
     * @throws Exception if the MD5 algorithm is not available
     */
    public static void main(String args[]) throws Exception
    {
        final InputStream in = new BufferedInputStream(System.in);
        final PrintStream out = System.out;
        MessageDigest digest = null; // non-null only while inside a record
        long length = 0L;            // long: a sequence may exceed Integer.MAX_VALUE bases
        for (;;)
        {
            final int c = in.read();
            switch (c)
            {
                case -1:
                case '>':
                {
                    // End of the current record (if any): finish its output line.
                    if (digest != null)
                    {
                        out.println(toHex(digest.digest()) + "\t" + length);
                        digest = null;
                    }
                    if (c == -1)
                    {
                        out.flush();
                        return;
                    }
                    // Start of a new record: the header becomes the first column.
                    out.print(readHeader(in));
                    out.print('\t');
                    digest = MessageDigest.getInstance("MD5");
                    length = 0L;
                    break;
                }
                case '\n':
                case ' ':
                case '\r':
                    // Layout characters are not part of the sequence.
                    break;
                default:
                {
                    if (digest == null)
                    {
                        // Fail with a clear message instead of a NullPointerException.
                        throw new IOException(
                            "Malformed FASTA: sequence data found before the first '>' header");
                    }
                    digest.update((byte) Character.toUpperCase(c));
                    ++length;
                    break;
                }
            }
        }
    }

    /**
     * Reads the remainder of a header line, up to (and consuming) the next
     * {@code '\n'} or the end of the stream.
     *
     * @param in stream positioned just after the {@code '>'} marker
     * @return the header text without its line terminator; a trailing
     *         {@code '\r'} (CRLF input) is removed so it cannot leak into the
     *         tab-separated output
     * @throws IOException if reading fails
     */
    private static String readHeader(InputStream in) throws IOException
    {
        final StringBuilder header = new StringBuilder();
        int c;
        while ((c = in.read()) != -1 && c != '\n')
        {
            header.append((char) c);
        }
        final int end = header.length();
        if (end > 0 && header.charAt(end - 1) == '\r')
        {
            header.setLength(end - 1);
        }
        return header.toString();
    }

    /**
     * Formats bytes as lower-case hexadecimal, two digits per byte.
     *
     * @param bytes the bytes to format
     * @return the hexadecimal string
     */
    private static String toHex(byte[] bytes)
    {
        final StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes)
        {
            hex.append(Character.forDigit((b >> 4) & 0xf, 16));
            hex.append(Character.forDigit(b & 0xf, 16));
        }
        return hex.toString();
    }
}
The MD5 sums were extracted as follows:
# Compute "header<TAB>md5<TAB>length" for every sequence of each assembly.
$ curl -s "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz" | gunzip -c | java FastaMD5 > a.txt
$ curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz" | gunzip -c | tar Oxvf - 2> /dev/null | java FastaMD5 > b.txt
## join: pair the sequences on the MD5 column (field 2).
## The files are tab-separated and the g1k headers contain spaces, so the
## delimiter must be a tab ($'\t'), not a space. Keep md5, g1k name, ucsc name.
$ join -t $'\t' -1 2 -2 2 <(sort -t $'\t' -k2,2 a.txt ) <(sort -t $'\t' -k2,2 b.txt ) | cut -f 1,2,4 | sort -t $'\t' -k3,3
# unjoinable: sequences whose MD5 appears in only one of the two files
$ join -t $'\t' -1 2 -2 2 -v 1 -v 2 <(sort -t $'\t' -k2,2 a.txt ) <(sort -t $'\t' -k2,2 b.txt ) | sort -t $'\t' -k2,2
Here are the common chromosomes, joined on the hash-sum:
1b22b98cdeb4a9304cb5d48026a85128 | 1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 | chr1 | |
---|---|---|---|
988c28e000e84c26d552359af1ea2e1d | 10 dna:chromosome chromosome:GRCh37:10:1:135534747:1 | chr10 | |
98c59049a2df285c76ffb1c6db8f8b96 | 11 dna:chromosome chromosome:GRCh37:11:1:135006516:1 | chr11 | |
06cbf126247d89664a4faebad130fe9c | GL000202.1 dna:supercontig supercontig::GL000202.1:1:40103:1 | chr11_gl000202_random | |
51851ac0e1a115847ad36449b0015864 | 12 dna:chromosome chromosome:GRCh37:12:1:133851895:1 | chr12 | |
283f8d7892baa81b510a015719ca7b0b | 13 dna:chromosome chromosome:GRCh37:13:1:115169878:1 | chr13 | |
98f3cae32b2a2e9524bc19813927542e | 14 dna:chromosome chromosome:GRCh37:14:1:107349540:1 | chr14 | |
e5645a794a8238215b2cd77acb95a078 | 15 dna:chromosome chromosome:GRCh37:15:1:102531392:1 | chr15 | |
fc9b1a7b42b97a864f56b348b06095e6 | 16 dna:chromosome chromosome:GRCh37:16:1:90354753:1 | chr16 | |
351f64d4f4f9ddd45b35336ad97aa6de | 17 dna:chromosome chromosome:GRCh37:17:1:81195210:1 | chr17 | |
96358c325fe0e70bee73436e8bb14dbd | GL000203.1 dna:supercontig supercontig::GL000203.1:1:37498:1 | chr17_gl000203_random | |
efc49c871536fa8d79cb0a06fa739722 | GL000204.1 dna:supercontig supercontig::GL000204.1:1:81310:1 | chr17_gl000204_random | |
d22441398d99caf673e9afb9a1908ec5 | GL000205.1 dna:supercontig supercontig::GL000205.1:1:174588:1 | chr17_gl000205_random | |
43f69e423533e948bfae5ce1d45bd3f1 | GL000206.1 dna:supercontig supercontig::GL000206.1:1:41001:1 | chr17_gl000206_random | |
b15d4b2d29dde9d3e4f93d1d0f2cbc9c | 18 dna:chromosome chromosome:GRCh37:18:1:78077248:1 | chr18 | |
f3814841f1939d3ca19072d9e89f3fd7 | GL000207.1 dna:supercontig supercontig::GL000207.1:1:4262:1 | chr18_gl000207_random | |
1aacd71f30db8e561810913e0b72636d | 19 dna:chromosome chromosome:GRCh37:19:1:59128983:1 | chr19 | |
aa81be49bf3fe63a79bdc6a6f279abf6 | GL000208.1 dna:supercontig supercontig::GL000208.1:1:92689:1 | chr19_gl000208_random | |
f40598e2a5a6b26e84a3775e0d1e2c81 | GL000209.1 dna:supercontig supercontig::GL000209.1:1:159169:1 | chr19_gl000209_random | |
d75b436f50a8214ee9c2a51d30b2c2cc | GL000191.1 dna:supercontig supercontig::GL000191.1:1:106433:1 | chr1_gl000191_random | |
325ba9e808f669dfeee210fdd7b470ac | GL000192.1 dna:supercontig supercontig::GL000192.1:1:547496:1 | chr1_gl000192_random | |
a0d9851da00400dec1098a9255ac712e | 2 dna:chromosome chromosome:GRCh37:2:1:243199373:1 | chr2 | |
0dec9660ec1efaaf33281c0d5ea2560f | 20 dna:chromosome chromosome:GRCh37:20:1:63025520:1 | chr20 | |
2979a6085bfe28e3ad6f552f361ed74d | 21 dna:chromosome chromosome:GRCh37:21:1:48129895:1 | chr21 | |
851106a74238044126131ce2a8e5847c | GL000210.1 dna:supercontig supercontig::GL000210.1:1:27682:1 | chr21_gl000210_random | |
a718acaa6135fdca8357d5bfe94211dd | 22 dna:chromosome chromosome:GRCh37:22:1:51304566:1 | chr22 | |
23dccd106897542ad87d2765d28a19a1 | 4 dna:chromosome chromosome:GRCh37:4:1:191154276:1 | chr4 | |
dbb6e8ece0b5de29da56601613007c2a | GL000193.1 dna:supercontig supercontig::GL000193.1:1:189789:1 | chr4_gl000193_random | |
6ac8f815bf8e845bb3031b73f812c012 | GL000194.1 dna:supercontig supercontig::GL000194.1:1:191469:1 | chr4_gl000194_random | |
0740173db9ffd264d728f32784845cd7 | 5 dna:chromosome chromosome:GRCh37:5:1:180915260:1 | chr5 | |
1d3a93a248d92a729ee764823acbbc6b | 6 dna:chromosome chromosome:GRCh37:6:1:171115067:1 | chr6 | |
618366e953d6aaad97dbe4777c29375e | 7 dna:chromosome chromosome:GRCh37:7:1:159138663:1 | chr7 | |
5d9ec007868d517e73543b005ba48535 | GL000195.1 dna:supercontig supercontig::GL000195.1:1:182896:1 | chr7_gl000195_random | |
96f514a9929e410c6651697bded59aec | 8 dna:chromosome chromosome:GRCh37:8:1:146364022:1 | chr8 | |
d92206d1bb4c3b4019c43c0875c06dc0 | GL000196.1 dna:supercontig supercontig::GL000196.1:1:38914:1 | chr8_gl000196_random | |
6f5efdd36643a9b8c8ccad6f2f1edc7b | GL000197.1 dna:supercontig supercontig::GL000197.1:1:37175:1 | chr8_gl000197_random | |
3e273117f15e0a400f01055d9f393768 | 9 dna:chromosome chromosome:GRCh37:9:1:141213431:1 | chr9 | |
868e7784040da90d900d2d1b667a1383 | GL000198.1 dna:supercontig supercontig::GL000198.1:1:90085:1 | chr9_gl000198_random | |
569af3b73522fab4b40995ae4944e78e | GL000199.1 dna:supercontig supercontig::GL000199.1:1:169874:1 | chr9_gl000199_random | |
75e4c8d17cd4addf3917d1703cacaf25 | GL000200.1 dna:supercontig supercontig::GL000200.1:1:187035:1 | chr9_gl000200_random | |
dfb7e7ec60ffdcb85cb359ea28454ee9 | GL000201.1 dna:supercontig supercontig::GL000201.1:1:36148:1 | chr9_gl000201_random | |
7daaa45c66b288847b9b32b964e623d3 | GL000211.1 dna:supercontig supercontig::GL000211.1:1:166566:1 | chrUn_gl000211 | |
563531689f3dbd691331fd6c5730a88b | GL000212.1 dna:supercontig supercontig::GL000212.1:1:186858:1 | chrUn_gl000212 | |
9d424fdcc98866650b58f004080a992a | GL000213.1 dna:supercontig supercontig::GL000213.1:1:164239:1 | chrUn_gl000213 | |
46c2032c37f2ed899eb41c0473319a69 | GL000214.1 dna:supercontig supercontig::GL000214.1:1:137718:1 | chrUn_gl000214 | |
5eb3b418480ae67a997957c909375a73 | GL000215.1 dna:supercontig supercontig::GL000215.1:1:172545:1 | chrUn_gl000215 | |
642a232d91c486ac339263820aef7fe0 | GL000216.1 dna:supercontig supercontig::GL000216.1:1:172294:1 | chrUn_gl000216 | |
6d243e18dea1945fb7f2517615b8f52e | GL000217.1 dna:supercontig supercontig::GL000217.1:1:172149:1 | chrUn_gl000217 | |
1d708b54644c26c7e01c2dad5426d38c | GL000218.1 dna:supercontig supercontig::GL000218.1:1:161147:1 | chrUn_gl000218 | |
f977edd13bac459cb2ed4a5457dba1b3 | GL000219.1 dna:supercontig supercontig::GL000219.1:1:179198:1 | chrUn_gl000219 | |
fc35de963c57bf7648429e6454f1c9db | GL000220.1 dna:supercontig supercontig::GL000220.1:1:161802:1 | chrUn_gl000220 | |
3238fb74ea87ae857f9c7508d315babb | GL000221.1 dna:supercontig supercontig::GL000221.1:1:155397:1 | chrUn_gl000221 | |
6fe9abac455169f50470f5a6b01d0f59 | GL000222.1 dna:supercontig supercontig::GL000222.1:1:186861:1 | chrUn_gl000222 | |
399dfa03bf32022ab52a846f7ca35b30 | GL000223.1 dna:supercontig supercontig::GL000223.1:1:180455:1 | chrUn_gl000223 | |
d5b2fc04f6b41b212a4198a07f450e20 | GL000224.1 dna:supercontig supercontig::GL000224.1:1:179693:1 | chrUn_gl000224 | |
63945c3e6962f28ffd469719a747e73c | GL000225.1 dna:supercontig supercontig::GL000225.1:1:211173:1 | chrUn_gl000225 | |
1c1b2cd1fccbc0a99b6a447fa24d1504 | GL000226.1 dna:supercontig supercontig::GL000226.1:1:15008:1 | chrUn_gl000226 | |
a4aead23f8053f2655e468bcc6ecdceb | GL000227.1 dna:supercontig supercontig::GL000227.1:1:128374:1 | chrUn_gl000227 | |
c5a17c97e2c1a0b6a9cc5a6b064b714f | GL000228.1 dna:supercontig supercontig::GL000228.1:1:129120:1 | chrUn_gl000228 | |
d0f40ec87de311d8e715b52e4c7062e1 | GL000229.1 dna:supercontig supercontig::GL000229.1:1:19913:1 | chrUn_gl000229 | |
b4eb71ee878d3706246b7c1dbef69299 | GL000230.1 dna:supercontig supercontig::GL000230.1:1:43691:1 | chrUn_gl000230 | |
ba8882ce3a1efa2080e5d29b956568a4 | GL000231.1 dna:supercontig supercontig::GL000231.1:1:27386:1 | chrUn_gl000231 | |
3e06b6741061ad93a8587531307057d8 | GL000232.1 dna:supercontig supercontig::GL000232.1:1:40652:1 | chrUn_gl000232 | |
7fed60298a8d62ff808b74b6ce820001 | GL000233.1 dna:supercontig supercontig::GL000233.1:1:45941:1 | chrUn_gl000233 | |
93f998536b61a56fd0ff47322a911d4b | GL000234.1 dna:supercontig supercontig::GL000234.1:1:40531:1 | chrUn_gl000234 | |
118a25ca210cfbcdfb6c2ebb249f9680 | GL000235.1 dna:supercontig supercontig::GL000235.1:1:34474:1 | chrUn_gl000235 | |
fdcd739913efa1fdc64b6c0cd7016779 | GL000236.1 dna:supercontig supercontig::GL000236.1:1:41934:1 | chrUn_gl000236 | |
e0c82e7751df73f4f6d0ed30cdc853c0 | GL000237.1 dna:supercontig supercontig::GL000237.1:1:45867:1 | chrUn_gl000237 | |
131b1efc3270cc838686b54e7c34b17b | GL000238.1 dna:supercontig supercontig::GL000238.1:1:39939:1 | chrUn_gl000238 | |
99795f15702caec4fa1c4e15f8a29c07 | GL000239.1 dna:supercontig supercontig::GL000239.1:1:33824:1 | chrUn_gl000239 | |
445a86173da9f237d7bcf41c6cb8cc62 | GL000240.1 dna:supercontig supercontig::GL000240.1:1:41933:1 | chrUn_gl000240 | |
ef4258cdc5a45c206cea8fc3e1d858cf | GL000241.1 dna:supercontig supercontig::GL000241.1:1:42152:1 | chrUn_gl000241 | |
2f8694fc47576bc81b5fe9e7de0ba49e | GL000242.1 dna:supercontig supercontig::GL000242.1:1:43523:1 | chrUn_gl000242 | |
cc34279a7e353136741c9fce79bc4396 | GL000243.1 dna:supercontig supercontig::GL000243.1:1:43341:1 | chrUn_gl000243 | |
0996b4475f353ca98bacb756ac479140 | GL000244.1 dna:supercontig supercontig::GL000244.1:1:39929:1 | chrUn_gl000244 | |
89bc61960f37d94abf0df2d481ada0ec | GL000245.1 dna:supercontig supercontig::GL000245.1:1:36651:1 | chrUn_gl000245 | |
e4afcd31912af9d9c2546acf1cb23af2 | GL000246.1 dna:supercontig supercontig::GL000246.1:1:38154:1 | chrUn_gl000246 | |
7de00226bb7df1c57276ca6baabafd15 | GL000247.1 dna:supercontig supercontig::GL000247.1:1:36422:1 | chrUn_gl000247 | |
5a8e43bec9be36c7b49c84d585107776 | GL000248.1 dna:supercontig supercontig::GL000248.1:1:39786:1 | chrUn_gl000248 | |
1d78abec37c15fe29a275eb08d5af236 | GL000249.1 dna:supercontig supercontig::GL000249.1:1:38502:1 | chrUn_gl000249 | |
7e0e2e580297b7764e31dbc80c2540dd | X dna:chromosome chromosome:GRCh37:X:1:155270560:1 | chrX |
And here are the unpairable data:
d89517b400226d3b56e753972a7cad67 chr17_ctg5_hap1 1680828 | |
641e4338fa8d52a5b781bd2a2c08d3c3 chr3 198022430 | |
fa24f81b680df26bcfb6d69b784fbe36 chr4_ctg9_hap1 590426 | |
fe71bc63420d666884f37a3ad79f3317 chr6_apd_hap1 4622290 | |
18c17e1641ef04873b15f40f6c8659a4 chr6_cox_hap2 4795371 | |
2a3c677c426a10e137883ae1ffb8da3f chr6_dbb_hap3 4610396 | |
9d51d4152174461cd6715c7ddc588dc8 chr6_mann_hap4 4683263 | |
efed415dd8742349cb7aaca054675b9a chr6_mcf_hap5 4833398 | |
094d037050cad692b57ea12c4fef790f chr6_qbl_hap6 4611984 | |
3b6d666200e72bcc036bf88a4d7e0749 chr6_ssto_hap7 4928567 | |
d2ed829b8a1628d16cbeee88e88e39eb chrM 16571 | |
1e86411d73e6f00a10590f976be01623 chrY 59373566 | |
fdfd811849cc2fadebc929bb925902e5 3 dna:chromosome chromosome:GRCh37:3:1:198022430:1 198022430 | |
c68f52674c9fb33aef52dcf399755519 MT gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome 16569 | |
1fa3474750af0948bdf97d5a0ee52e51 Y dna:chromosome chromosome:GRCh37:Y:2649521:59034049:1 59373566 |
I knew about the problem with chrY ( http://www.biostars.org/p/58143/ ) but not about chr3. What is the problem with this chromosome?
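Edit: Here are the base counts for UCSC/chr3:
{T=58760485, G=38670110, A=58713343, C=38653197, N=3225295}
and for g1kv37:
{T=58760485, G=38670110, A=58713343, R=2, C=38653197, M=1, N=3225292}
The counts differ by only three bases: g1kv37 has two R and one M, while UCSC has three more N. That is enough to change the MD5 sum.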
That's it,
Pierre.