Skip to content

Commit a851c3a

Browse files
committed
Add support for reading and writing SBI files for BAMs.
1 parent 38a24d5 commit a851c3a

File tree

7 files changed

+589
-0
lines changed

7 files changed

+589
-0
lines changed

src/main/java/htsjdk/samtools/BAMFileReader.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,12 @@ static long findVirtualOffsetOfFirstRecord(final File bam) throws IOException {
337337
return offset;
338338
}
339339

340+
/** Reads through the header and sequence records to find the virtual file offset of the first record in the BAM file. */
341+
static long findVirtualOffsetOfFirstRecord(final SeekableStream seekableStream) throws IOException {
342+
final BAMFileReader reader = new BAMFileReader(seekableStream, (SeekableStream) null, false, false, ValidationStringency.SILENT, new DefaultSAMRecordFactory());
343+
return reader.mFirstRecordPointer;
344+
}
345+
340346
/**
341347
* If true, writes the source of every read into the source SAMRecords.
342348
* @param enabled true to write source information into each SAMRecord.
@@ -944,6 +950,10 @@ public CloseableIterator<SAMRecord> createIndexIterator(final QueryInterval[] in
944950
return new BAMQueryFilteringIterator(iterator, new BAMQueryMultipleIntervalsIteratorFilter(intervals, contained));
945951
}
946952

953+
public long getVirtualFilePointer() {
954+
return mCompressedInputStream.getFilePointer();
955+
}
956+
947957
/**
948958
* Iterate over the SAMRecords defined by the sections of the file described in the ctor argument.
949959
*/
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package htsjdk.samtools;
2+
3+
import htsjdk.samtools.cram.io.InputStreamUtils;
4+
import htsjdk.samtools.seekablestream.SeekablePathStream;
5+
import htsjdk.samtools.seekablestream.SeekableStream;
6+
import htsjdk.samtools.util.BlockCompressedInputStream;
7+
import htsjdk.samtools.util.IOUtil;
8+
9+
import java.io.EOFException;
10+
import java.io.IOException;
11+
import java.io.OutputStream;
12+
import java.nio.ByteBuffer;
13+
import java.nio.ByteOrder;
14+
import java.nio.file.Files;
15+
import java.nio.file.Path;
16+
17+
/**
18+
* Writes SBI files for BAM files, as understood by {@link SBIIndex}.
19+
*/
20+
public final class BAMSBIIndexer {
21+
22+
/**
23+
* Perform indexing on the given BAM file, at the granularity level specified.
24+
*
25+
* @param bamFile the path to the BAM file
26+
* @param granularity write the offset of every n-th alignment to the index
27+
* @throws IOException as per java IO contract
28+
*/
29+
public static void createIndex(final Path bamFile, final long granularity) throws IOException {
30+
Path splittingBaiFile = IOUtil.addExtension(bamFile, SBIIndex.FILE_EXTENSION);
31+
try (SeekableStream in = new SeekablePathStream(bamFile); OutputStream out = Files.newOutputStream(splittingBaiFile)) {
32+
createIndex(in, out, granularity);
33+
}
34+
}
35+
36+
/**
37+
* Perform indexing on the given BAM file, at the granularity level specified.
38+
*
39+
* @param in a seekable stream for reading the BAM file from
40+
* @param out the stream to write the index to
41+
* @param granularity write the offset of every n-th alignment to the index
42+
* @throws IOException as per java IO contract
43+
*/
44+
public static void createIndex(final SeekableStream in, final OutputStream out, final long granularity) throws IOException {
45+
long recordStart = SAMUtils.findVirtualOffsetOfFirstRecordInBam(in);
46+
try (BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in)) {
47+
blockIn.seek(recordStart);
48+
final ByteBuffer byteBuffer = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN); // BAM is little-endian
49+
SBIIndexWriter indexWriter = new SBIIndexWriter(out, granularity);
50+
while (true) {
51+
try {
52+
recordStart = blockIn.getFilePointer();
53+
InputStreamUtils.readFully(blockIn, byteBuffer.array(), 0, 4);
54+
final int blockSize = byteBuffer.getInt(0); // length of remainder of alignment record
55+
indexWriter.processRecord(recordStart);
56+
InputStreamUtils.skipFully(blockIn, blockSize);
57+
} catch (EOFException e) {
58+
break;
59+
}
60+
}
61+
indexWriter.writeVirtualOffset(recordStart);
62+
indexWriter.finish(in.length());
63+
}
64+
}
65+
}

src/main/java/htsjdk/samtools/SAMUtils.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
*/
2424
package htsjdk.samtools;
2525

26+
import htsjdk.samtools.seekablestream.SeekableStream;
2627
import htsjdk.samtools.util.BinaryCodec;
2728
import htsjdk.samtools.util.CigarUtil;
2829
import htsjdk.samtools.util.CloserUtil;
@@ -685,6 +686,18 @@ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) {
685686
}
686687
}
687688

689+
/**
690+
* Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file
691+
* offset after skipping over the text header and the sequence records.
692+
*/
693+
public static long findVirtualOffsetOfFirstRecordInBam(final SeekableStream seekableStream) {
694+
try {
695+
return BAMFileReader.findVirtualOffsetOfFirstRecord(seekableStream);
696+
} catch (final IOException ioe) {
697+
throw new RuntimeEOFException(ioe);
698+
}
699+
}
700+
688701
/**
689702
* Given a Cigar, Returns blocks of the sequence that have been aligned directly to the
690703
* reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference)
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
package htsjdk.samtools;
2+
3+
import htsjdk.samtools.util.BinaryCodec;
4+
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
5+
6+
import java.io.BufferedInputStream;
7+
import java.io.IOException;
8+
import java.io.InputStream;
9+
import java.nio.file.Files;
10+
import java.nio.file.Path;
11+
import java.util.ArrayList;
12+
import java.util.Arrays;
13+
import java.util.List;
14+
import java.util.NavigableSet;
15+
import java.util.TreeSet;
16+
17+
/**
18+
* SBI is an index into BGZF-compressed data files, which has an entry for the file position of the start of every
19+
* <i>n</i>th record. Reads files that were created by {@link BAMSBIIndexer}.
20+
*/
21+
public final class SBIIndex {
22+
23+
public static class Header {
24+
private final long fileLength;
25+
private final byte[] md5;
26+
private final byte[] uuid;
27+
private final long totalNumberOfRecords;
28+
private final long granularity;
29+
30+
public Header(long fileLength, byte[] md5, byte[] uuid, long totalNumberOfRecords, long granularity) {
31+
this.fileLength = fileLength;
32+
this.md5 = md5;
33+
this.uuid = uuid;
34+
this.totalNumberOfRecords = totalNumberOfRecords;
35+
this.granularity = granularity;
36+
}
37+
38+
public long getFileLength() {
39+
return fileLength;
40+
}
41+
42+
public byte[] getMd5() {
43+
return md5;
44+
}
45+
46+
public byte[] getUuid() {
47+
return uuid;
48+
}
49+
50+
public long getTotalNumberOfRecords() {
51+
return totalNumberOfRecords;
52+
}
53+
54+
public long getGranularity() {
55+
return granularity;
56+
}
57+
}
58+
59+
public static final String FILE_EXTENSION = ".sbi";
60+
61+
/**
62+
* SBI magic number.
63+
*/
64+
static final byte[] SBI_MAGIC = "SBI\1".getBytes();
65+
66+
private final Header header;
67+
private final NavigableSet<Long> virtualOffsets;
68+
69+
/**
70+
* Create an in-memory SBI with the given virtual offsets.
71+
* @param virtualOffsets the offsets in the index
72+
*/
73+
public SBIIndex(final Header header, final NavigableSet<Long> virtualOffsets) {
74+
this.header = header;
75+
this.virtualOffsets = new TreeSet<>(virtualOffsets);
76+
if (this.virtualOffsets.isEmpty()) {
77+
throw new RuntimeException("Invalid SBI format: should contain at least one offset");
78+
}
79+
}
80+
81+
/**
82+
* Load an SBI into memory from a path.
83+
* @param path the path to the SBI file
84+
* @throws IOException as per java IO contract
85+
*/
86+
public static SBIIndex load(final Path path) throws IOException {
87+
try (InputStream in = new BufferedInputStream(Files.newInputStream(path))) {
88+
return readIndex(in);
89+
}
90+
}
91+
92+
/**
93+
* Load an SBI into memory from a stream.
94+
* @param in the stream to read the SBI from
95+
*/
96+
public static SBIIndex load(final InputStream in) {
97+
return readIndex(in);
98+
}
99+
100+
private static SBIIndex readIndex(final InputStream in) {
101+
BinaryCodec binaryCodec = new BinaryCodec(in);
102+
Header header = readHeader(binaryCodec);
103+
long numOffsets = binaryCodec.readLong();
104+
NavigableSet<Long> virtualOffsets = new TreeSet<>();
105+
long prev = -1;
106+
for (long i = 0; i < numOffsets; i++) {
107+
long cur = binaryCodec.readLong();
108+
if (prev > cur) {
109+
throw new RuntimeException(String.format(
110+
"Invalid SBI; offsets not in order: %#x > %#x",
111+
prev, cur));
112+
}
113+
virtualOffsets.add(prev = cur);
114+
}
115+
return new SBIIndex(header, virtualOffsets);
116+
}
117+
118+
private static Header readHeader(BinaryCodec binaryCodec) {
119+
final byte[] buffer = new byte[SBI_MAGIC.length];
120+
binaryCodec.readBytes(buffer);
121+
if (!Arrays.equals(buffer, SBI_MAGIC)) {
122+
throw new RuntimeException("Invalid file header in SBI: " + new String(buffer) + " (" + Arrays.toString(buffer) + ")");
123+
}
124+
long fileLength = binaryCodec.readLong();
125+
byte[] md5 = new byte[16];
126+
binaryCodec.readBytes(md5);
127+
byte[] uuid = new byte[16];
128+
binaryCodec.readBytes(uuid);
129+
long totalNumberOfRecords = binaryCodec.readLong();
130+
long granularity = binaryCodec.readLong();
131+
return new Header(fileLength, md5, uuid, totalNumberOfRecords, granularity);
132+
}
133+
134+
/**
135+
* Returns the granularity of the index, that is the number of alignments between subsequent entries in the index,
136+
* or zero if not specified.
137+
* @return the granularity of the index
138+
*/
139+
public long getGranularity() {
140+
return header.getGranularity();
141+
}
142+
143+
/**
144+
* Returns the entries in the index.
145+
*
146+
* @return a set of file pointers for all the alignment offsets in the index, in ascending order. The last
147+
* virtual file pointer is the position at which the next record would start if it were added to the file.
148+
*/
149+
public NavigableSet<Long> getVirtualOffsets() {
150+
return new TreeSet<>(virtualOffsets);
151+
}
152+
153+
/**
154+
* Returns number of entries in the index.
155+
*
156+
* @return the number of virtual offsets in the index
157+
*/
158+
public long size() {
159+
return virtualOffsets.size();
160+
}
161+
162+
/**
163+
* Returns the length of the data file in bytes.
164+
*
165+
* @return the length of the data file in bytes
166+
*/
167+
public long dataFileLength() {
168+
return header.getFileLength();
169+
}
170+
171+
/**
172+
* Split the data file for this index into non-overlapping chunks of roughly the given size that cover the whole
173+
* file and that can be read independently of one another.
174+
*
175+
* @param splitSize the rough size of each split in bytes
176+
* @return a list of contiguous, non-overlapping, sorted chunks that cover the whole data file
177+
* @see #getChunk(long, long)
178+
*/
179+
public List<Chunk> split(long splitSize) {
180+
if (splitSize <= 0) {
181+
throw new IllegalArgumentException(String.format("Split size must be positive: %s", splitSize));
182+
}
183+
long fileSize = dataFileLength();
184+
List<Chunk> chunks = new ArrayList<>();
185+
for (long splitStart = 0; splitStart < fileSize; splitStart += splitSize) {
186+
Chunk chunk = getChunk(splitStart, splitStart + splitSize);
187+
if (chunk != null) {
188+
chunks.add(chunk);
189+
}
190+
}
191+
return chunks;
192+
}
193+
194+
/**
195+
* Return a chunk that corresponds to the given range in the data file. Note that the chunk does not necessarily
196+
* completely cover the given range, however this method will map a set of contiguous, non-overlapping file ranges
197+
* that cover the whole data file to a set of contiguous, non-overlapping chunks that cover the whole data file.
198+
*
199+
* @param splitStart the start of the file range (inclusive)
200+
* @param splitEnd the start of the file range (exclusive)
201+
* @return a chunk whose virtual start is at the first alignment start position that is greater than or equal to the
202+
* given split start position, and whose virtual end is at the first alignment start position that is greater than
203+
* or equal to the given split end position, or null if the chunk would be empty.
204+
* @see #split(long)
205+
*/
206+
public Chunk getChunk(long splitStart, long splitEnd) {
207+
if (splitStart >= splitEnd) {
208+
throw new IllegalArgumentException(String.format("Split start (%s) must be less than end (%s)", splitStart, splitEnd));
209+
}
210+
long maxEnd = BlockCompressedFilePointerUtil.getBlockAddress(virtualOffsets.last());
211+
splitStart = Math.min(splitStart, maxEnd);
212+
splitEnd = Math.min(splitEnd, maxEnd);
213+
long virtualSplitStart = BlockCompressedFilePointerUtil.makeFilePointer(splitStart);
214+
long virtualSplitEnd = BlockCompressedFilePointerUtil.makeFilePointer(splitEnd);
215+
Long virtualSplitStartAlignment = virtualOffsets.ceiling(virtualSplitStart);
216+
Long virtualSplitEndAlignment = virtualOffsets.ceiling(virtualSplitEnd);
217+
// neither virtualSplitStartAlignment nor virtualSplitEndAlignment should ever be null, but check anyway
218+
if (virtualSplitStartAlignment == null) {
219+
throw new IllegalArgumentException(String.format("No virtual offset found for virtual file pointer %s, last virtual offset %s",
220+
BlockCompressedFilePointerUtil.asString(virtualSplitStart), BlockCompressedFilePointerUtil.asString(virtualOffsets.last())));
221+
}
222+
if (virtualSplitEndAlignment == null) {
223+
throw new IllegalArgumentException(String.format("No virtual offset found for virtual file pointer %s, last virtual offset %s",
224+
BlockCompressedFilePointerUtil.asString(virtualSplitEnd), BlockCompressedFilePointerUtil.asString(virtualOffsets.last())));
225+
}
226+
if (virtualSplitStartAlignment.longValue() == virtualSplitEndAlignment.longValue()) {
227+
return null;
228+
}
229+
return new Chunk(virtualSplitStartAlignment, virtualSplitEndAlignment);
230+
}
231+
232+
@Override
233+
public boolean equals(Object o) {
234+
if (this == o) return true;
235+
if (o == null || getClass() != o.getClass()) return false;
236+
237+
SBIIndex that = (SBIIndex) o;
238+
239+
return virtualOffsets.equals(that.virtualOffsets);
240+
}
241+
242+
@Override
243+
public int hashCode() {
244+
return virtualOffsets.hashCode();
245+
}
246+
247+
@Override
248+
public String toString() {
249+
return virtualOffsets.toString();
250+
}
251+
}

0 commit comments

Comments
 (0)