
Commit e375860

Use latest SBI code from htsjdk PR (samtools/htsjdk#1138)

1 parent: 0bda7b8
8 files changed: +38 -32 lines

build.gradle

Lines changed: 0 additions & 4 deletions

@@ -282,10 +282,6 @@ dependencies {
     compile 'org.testng:testng:' + testNGVersion //compile instead of testCompile because it is needed for test infrastructure that needs to be packaged
     compile 'org.apache.hadoop:hadoop-minicluster:' + hadoopVersion

-    compile('org.seqdoop:hadoop-bam:' + hadoopBamVersion) {
-        exclude group: 'org.apache.hadoop'
-        exclude module: 'htsjdk'
-    }
     compile files('lib/disq-0.0.1-SNAPSHOT.jar')
     compile('org.apache.hadoop:hadoop-client:' + hadoopVersion) // should be a 'provided' dependency
     compile('com.github.jsr203hadoop:jsr203hadoop:1.0.3')

src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java

Lines changed: 30 additions & 19 deletions

@@ -1,6 +1,10 @@
 package org.broadinstitute.hellbender.tools.spark;

 import htsjdk.samtools.*;
+import htsjdk.samtools.BAMSBIIndexer;
+import htsjdk.samtools.seekablestream.SeekableFileStream;
+import htsjdk.samtools.seekablestream.SeekableStream;
+import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.barclay.argparser.Argument;
@@ -14,7 +18,6 @@
 import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.read.ReadConstants;
 import org.codehaus.plexus.util.FileUtils;
-import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import picard.cmdline.programgroups.OtherProgramGroup;

 import java.io.*;
@@ -70,15 +73,15 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {

     @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
             shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
-            doc = "The BAM splitting_index file. If this is unspecified an index will be created with the same name as " +
-                    "the input file but with the additional extension " + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION,
+            doc = "The splitting index (SBI) file. If this is unspecified an index will be created with the same name as " +
+                    "the input file but with the additional extension " + SBIIndex.FILE_EXTENSION,
             optional = true)
     public File output;

     @Argument(fullName = SPLITTING_INDEX_GRANULARITY_LONG_NAME,
             doc = "Splitting index granularity, an entry is created in the index every this many reads.",
             optional = true)
-    public int granularity = SplittingBAMIndexer.DEFAULT_GRANULARITY;
+    public long granularity = SBIIndexWriter.DEFAULT_GRANULARITY;

     @Argument(fullName = CREATE_BAI_LONG_NAME,
             doc = "Set this to create a bai index at the same time as creating a splitting index",
@@ -89,7 +92,7 @@ public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {
     @Override
     public Object doWork() {
         if( granularity <= 0) {
-            throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Integer.toString(granularity), "Granularity must be > 0");
+            throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Long.toString(granularity), "Granularity must be > 0");
         }
         final File index = getOutputFile(output, inputBam);
         if(createBai){
@@ -101,19 +104,17 @@ public Object doWork() {
         return 0;
     }

-    private static void createOnlySplittingIndex(final File inputBam, final File index, final int granularity) {
+    private static void createOnlySplittingIndex(final File inputBam, final File index, final long granularity) {
         assertIsBam(inputBam);
-        //createBamSplittingIndex(inputBam, getOutputFile(output, inputBam), readValidationStringency, granularity);
-        try(BufferedInputStream in = new BufferedInputStream(new FileInputStream(inputBam));
-            BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
-            SplittingBAMIndexer.index(in, out, inputBam.length(), granularity);
-
+        try(SeekableStream in = new SeekableFileStream(inputBam);
+            BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
+            BAMSBIIndexer.createIndex(in, out, granularity);
         } catch (final IOException e) {
             throw new UserException("Couldn't create splitting index", e);
         }
     }

-    private static void createBaiAndSplittingIndex(final File inputBam, final File index, final int granularity, final ValidationStringency readValidationStringency) {
+    private static void createBaiAndSplittingIndex(final File inputBam, final File index, final long granularity, final ValidationStringency readValidationStringency) {
         assertIsBam(inputBam);
         try(SamReader reader = SamReaderFactory.makeDefault()
                 .validationStringency(readValidationStringency)
@@ -122,14 +123,24 @@ private static void createBaiAndSplittingIndex(final File inputBam, final File i
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
             final SAMFileHeader header = reader.getFileHeader();
             assertBamIsCoordinateSorted(header);
-            final SplittingBAMIndexer indexer = new SplittingBAMIndexer(out, granularity);
+            final SBIIndexWriter indexer = new SBIIndexWriter(out, granularity);

             final BAMIndexer bamIndexer = new BAMIndexer(IOUtils.replaceExtension(index, BAMIndex.BAMIndexSuffix), header);
+            BAMFileSpan lastFilePointer = null;
             for(final SAMRecord read : reader){
-                indexer.processAlignment(read);
+                BAMFileSpan filePointer = (BAMFileSpan) read.getFileSource().getFilePointer();
+                indexer.processRecord(filePointer.getFirstOffset());
                 bamIndexer.processAlignment(read);
+                lastFilePointer = filePointer;
+            }
+            long nextStart = 0;
+            if (lastFilePointer != null && !lastFilePointer.getChunks().isEmpty()) {
+                nextStart = lastFilePointer.getChunks().get(0).getChunkEnd();
+            }
+            if (nextStart == 0) {
+                nextStart = BlockCompressedFilePointerUtil.makeFilePointer(inputBam.length()); // default to file length (in case of no reads)
             }
-            indexer.finish(inputBam.length());
+            indexer.finish(nextStart, inputBam.length()); // nextStart is start of next record that would be added
             bamIndexer.finish();
         } catch (final IOException e) {
             throw new UserException("Couldn't create splitting index", e);
@@ -153,11 +164,11 @@ private static void assertIsBam(final File inputBam) {

     private static File getOutputFile(final File suggestedOutput, final File input) {
         if(suggestedOutput == null){
-            return new File(input.getPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+            return new File(input.getPath() + SBIIndex.FILE_EXTENSION);
         } else {
-            if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)){
-                logger.warn("Creating a splitting index with an extension that doesn't match "
-                        + "bam"+SplittingBAMIndexer.OUTPUT_FILE_EXTENSION + ". Output file: "+suggestedOutput);
+            if (!suggestedOutput.getAbsolutePath().endsWith("bam" + SBIIndex.FILE_EXTENSION)){
+                logger.warn("Creating a splitting index (SBI) with an extension that doesn't match "
+                        + "bam"+SBIIndex.FILE_EXTENSION + ". Output file: "+suggestedOutput);
             }
             return suggestedOutput;
         }
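
Background on the API swap above: hadoop-bam's SplittingBAMIndexer consumed whole SAMRecords, while htsjdk's SBIIndexWriter records BGZF virtual file offsets, so the tool now pulls each read's file pointer and passes finish() the virtual offset at which the next record would begin. When no BAI is wanted, the whole job collapses to the single BAMSBIIndexer.createIndex call seen in createOnlySplittingIndex. A minimal standalone sketch of that simple path, assuming an htsjdk build that includes samtools/htsjdk#1138 (the input.bam / input.bam.sbi paths are placeholders):

```java
import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.SBIIndexWriter;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

public final class CreateSbiSketch {
    public static void main(final String[] args) throws IOException {
        final File bam = new File("input.bam");      // placeholder input path
        final File sbi = new File("input.bam.sbi");  // SBI extension appended to the BAM name
        try (SeekableStream in = new SeekableFileStream(bam);
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(sbi))) {
            // Scans the BAM once and writes an index entry every
            // SBIIndexWriter.DEFAULT_GRANULARITY reads, as the tool does.
            BAMSBIIndexer.createIndex(in, out, SBIIndexWriter.DEFAULT_GRANULARITY);
        }
    }
}
```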

src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java

Lines changed: 2 additions & 2 deletions

@@ -4,6 +4,7 @@
 import htsjdk.samtools.SAMFileHeader;
 import htsjdk.samtools.SAMRecord;
 import htsjdk.samtools.SAMRecordCoordinateComparator;
+import htsjdk.samtools.SBIIndex;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -19,7 +20,6 @@
 import org.broadinstitute.hellbender.GATKBaseTest;
 import org.broadinstitute.hellbender.utils.test.MiniClusterUtils;
 import org.broadinstitute.hellbender.utils.test.ReadTestUtils;
-import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import org.testng.Assert;
 import org.testng.annotations.AfterClass;
 import org.testng.annotations.BeforeClass;
@@ -155,7 +155,7 @@ private void assertSingleShardedWritingWorks(String inputBam, String referenceFi

         // check that a splitting bai file is created
         if (IOUtils.isBamFileName(outputPath)) {
-            //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)));
+            //Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SBIIndex.FILE_EXTENSION)));
         }

         JavaRDD<GATKRead> rddParallelReads2 = readSource.getParallelReads(outputPath, referenceFile);

src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java

Lines changed: 6 additions & 7 deletions

@@ -1,14 +1,13 @@
 package org.broadinstitute.hellbender.tools.spark;

 import htsjdk.samtools.BAMIndex;
+import htsjdk.samtools.SBIIndex;
 import htsjdk.samtools.util.IOUtil;
 import org.broadinstitute.barclay.argparser.CommandLineException;
 import org.broadinstitute.hellbender.CommandLineProgramTest;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
-import org.seqdoop.hadoop_bam.SplittingBAMIndex;
-import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import org.testng.Assert;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
@@ -43,15 +42,15 @@ public void testCreateSplittingIndex(final File bam) throws IOException {
         assertIndexIsNotEmpty(splittingIndex);

         //checked in index created with
-        // java -cp target/hadoop-bam-7.4.1-SNAPSHOT-jar-with-dependencies.jar org.seqdoop.hadoop_bam.SplittingBAMIndexer 1 <filename>
-        final File expectedSplittingIndex = new File(bam.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+        // ./gatk CreateHadoopBamSplittingIndex --input <filename> --splitting-index-granularity 1
+        final File expectedSplittingIndex = new File(bam.toPath() + SBIIndex.FILE_EXTENSION);

         IOUtil.assertFilesEqual(splittingIndex, expectedSplittingIndex);
     }

     private static void assertIndexIsNotEmpty(final File splittingIndex) throws IOException {
         Assert.assertTrue(splittingIndex.exists());
-        final SplittingBAMIndex splittingBAMIndex = new SplittingBAMIndex(splittingIndex);
+        final SBIIndex splittingBAMIndex = SBIIndex.load(splittingIndex.toPath());
         Assert.assertTrue(splittingBAMIndex.size() > 0 );
     }

@@ -82,7 +81,7 @@ public void testUnspecifiedOutputProducesAdjacentIndex(final File bam) throws IO
         // we're going to write an index next to it on disk, and we don't want to write into the test resources folder
         final File bamCopy = createTempFile("copy-"+bam, ".bam");
         Files.copy(bam.toPath(), bamCopy.toPath(), StandardCopyOption.REPLACE_EXISTING);
-        final File expectedIndex = new File(bamCopy.toPath() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
+        final File expectedIndex = new File(bamCopy.toPath() + SBIIndex.FILE_EXTENSION);
         Assert.assertFalse(expectedIndex.exists());
         final ArgumentsBuilder args = new ArgumentsBuilder().addInput(bamCopy);
         this.runCommandLine(args);
@@ -131,7 +130,7 @@ public void testCantCreateBaiForUnsortedFile(){
     }

     private static File getTempIndexFile() {
-        return createTempFile("index", "bam.splitting-bai");
+        return createTempFile("index", "bam" + SBIIndex.FILE_EXTENSION);
     }
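
On the read side, the updated test loads the index with SBIIndex.load and checks that it is non-empty. A minimal sketch of inspecting an existing index, under the same htsjdk assumption (input.bam.sbi is a placeholder path):

```java
import htsjdk.samtools.SBIIndex;

import java.io.IOException;
import java.nio.file.Paths;

public final class InspectSbiSketch {
    public static void main(final String[] args) throws IOException {
        // Reads the whole .sbi file into memory, as assertIndexIsNotEmpty does above.
        final SBIIndex index = SBIIndex.load(Paths.get("input.bam.sbi"));
        // size() reports the number of virtual-offset entries recorded in the index.
        System.out.println("index entries: " + index.size());
    }
}
```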