Skip to content

Commit 75e30e3

Browse files
committed
Add support for reading and writing splitting BAM index files.
1 parent b2bfb32 commit 75e30e3

File tree

7 files changed

+531
-0
lines changed

7 files changed

+531
-0
lines changed

src/main/java/htsjdk/samtools/BAMFileReader.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,12 @@ static long findVirtualOffsetOfFirstRecord(final File bam) throws IOException {
337337
return offset;
338338
}
339339

340+
/** Reads through the header and sequence records to find the virtual file offset of the first record in the BAM file. */
341+
static long findVirtualOffsetOfFirstRecord(final SeekableStream seekableStream) throws IOException {
342+
final BAMFileReader reader = new BAMFileReader(seekableStream, (SeekableStream) null, false, false, ValidationStringency.SILENT, new DefaultSAMRecordFactory());
343+
return reader.mFirstRecordPointer;
344+
}
345+
340346
/**
341347
* If true, writes the source of every read into the source SAMRecords.
342348
* @param enabled true to write source information into each SAMRecord.

src/main/java/htsjdk/samtools/SAMUtils.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
*/
2424
package htsjdk.samtools;
2525

26+
import htsjdk.samtools.seekablestream.SeekableStream;
2627
import htsjdk.samtools.util.BinaryCodec;
2728
import htsjdk.samtools.util.CigarUtil;
2829
import htsjdk.samtools.util.CloserUtil;
@@ -685,6 +686,18 @@ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) {
685686
}
686687
}
687688

689+
/**
690+
* Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file
691+
* offset after skipping over the text header and the sequence records.
692+
*/
693+
public static long findVirtualOffsetOfFirstRecordInBam(final SeekableStream seekableStream) {
694+
try {
695+
return BAMFileReader.findVirtualOffsetOfFirstRecord(seekableStream);
696+
} catch (final IOException ioe) {
697+
throw new RuntimeEOFException(ioe);
698+
}
699+
}
700+
688701
/**
689702
* Given a Cigar, Returns blocks of the sequence that have been aligned directly to the
690703
* reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference)
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
package htsjdk.samtools;
2+
3+
import htsjdk.samtools.util.BinaryCodec;
4+
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
5+
import htsjdk.samtools.util.RuntimeEOFException;
6+
import htsjdk.samtools.util.RuntimeIOException;
7+
8+
import java.io.BufferedInputStream;
9+
import java.io.IOException;
10+
import java.io.InputStream;
11+
import java.nio.file.Files;
12+
import java.nio.file.Path;
13+
import java.util.ArrayList;
14+
import java.util.Arrays;
15+
import java.util.List;
16+
import java.util.NavigableSet;
17+
import java.util.TreeSet;
18+
19+
/**
20+
* An index into BAM files, which records the file position of the start of every <i>n</i>th record.
21+
* Reads files that are created by {@link SplittingBAMIndexer}.
22+
*
23+
* <p>Indexes the positions of individual BAM records in the file.</p>
24+
*/
25+
public final class SplittingBAMIndex {
26+
public static final String FILE_EXTENSION = ".sbi";
27+
28+
/**
29+
* Splitting BAM index file magic number.
30+
*/
31+
static final byte[] SPLITTING_BAM_INDEX_MAGIC = "SBI\1".getBytes();
32+
33+
private final int granularity;
34+
private final NavigableSet<Long> virtualOffsets = new TreeSet<>();
35+
36+
/**
37+
* Create an in-memory splitting BAM index with the given virtual offsets.
38+
* @param virtualOffsets the offsets in the index
39+
*/
40+
public SplittingBAMIndex(final int granularity, final NavigableSet<Long> virtualOffsets) {
41+
this.granularity = granularity;
42+
this.virtualOffsets.addAll(virtualOffsets);
43+
if (virtualOffsets.isEmpty()) {
44+
throw new RuntimeIOException("Invalid splitting BAM index: should contain at least the file size");
45+
}
46+
}
47+
48+
/**
49+
* Load a splitting BAM index from a path.
50+
* @param path the path to the splitting BAM index
51+
* @throws IOException as per java IO contract
52+
*/
53+
public static SplittingBAMIndex load(final Path path) throws IOException {
54+
try (InputStream in = new BufferedInputStream(Files.newInputStream(path))) {
55+
return readIndex(in);
56+
}
57+
}
58+
59+
/**
60+
* Load a splitting BAM index from a stream.
61+
* @param in the stream to read the splitting BAM index from
62+
*/
63+
public static SplittingBAMIndex load(final InputStream in) {
64+
return readIndex(in);
65+
}
66+
67+
private static SplittingBAMIndex readIndex(final InputStream in) {
68+
BinaryCodec binaryCodec = new BinaryCodec(in);
69+
int granularity = readHeader(binaryCodec);
70+
NavigableSet<Long> virtualOffsets = new TreeSet<>();
71+
try {
72+
long prev = -1;
73+
while (true) {
74+
long cur = binaryCodec.readLong();
75+
if (prev > cur) {
76+
throw new RuntimeIOException(String.format(
77+
"Invalid splitting BAM index; offsets not in order: %#x > %#x",
78+
prev, cur));
79+
}
80+
virtualOffsets.add(prev = cur);
81+
}
82+
} catch (RuntimeEOFException e) {
83+
// signals end of index
84+
}
85+
return new SplittingBAMIndex(granularity, virtualOffsets);
86+
}
87+
88+
private static int readHeader(BinaryCodec binaryCodec) {
89+
final byte[] buffer = new byte[SPLITTING_BAM_INDEX_MAGIC.length];
90+
binaryCodec.readBytes(buffer);
91+
if (!Arrays.equals(buffer, SPLITTING_BAM_INDEX_MAGIC)) {
92+
throw new RuntimeIOException("Invalid file header in splitting BAM index: " + new String(buffer) + " (" + Arrays.toString(buffer) + ")");
93+
}
94+
int granularity = binaryCodec.readInt();
95+
return granularity;
96+
}
97+
98+
/**
99+
* Returns the granularity of the index, that is the number of alignments between subsequent entries in the index.
100+
* @return the granularity of the index
101+
*/
102+
public int getGranularity() {
103+
return granularity;
104+
}
105+
106+
/**
107+
* Returns the entries in the index.
108+
*
109+
* @return a set of file pointers for all the alignment offsets in the index, in ascending order. The last
110+
* virtual file pointer is the (virtual) length of the file.
111+
*/
112+
public NavigableSet<Long> getVirtualOffsets() {
113+
return new TreeSet<>(virtualOffsets);
114+
}
115+
116+
/**
117+
* Returns number of entries in the index.
118+
*
119+
* @return the number of virtual offsets in the index
120+
*/
121+
public int size() {
122+
return virtualOffsets.size();
123+
}
124+
125+
/**
126+
* Returns size of the BAM file, in bytes.
127+
*
128+
* @return the length of the BAM file, in bytes
129+
*/
130+
public long bamSize() {
131+
return BlockCompressedFilePointerUtil.getBlockAddress(virtualOffsets.last());
132+
}
133+
134+
/**
135+
* Split the BAM file for this index into non-overlapping chunks of roughly the given size that cover the whole
136+
* file and that can be read independently of one another.
137+
*
138+
* @param splitSize the rough size of each split in bytes
139+
* @return a list of contiguous, non-overlapping, sorted chunks that cover the whole BAM file
140+
* @see #getChunk(long, long)
141+
*/
142+
public List<Chunk> split(int splitSize) {
143+
if (splitSize <= 0) {
144+
throw new IllegalArgumentException(String.format("Split size must be positive: %s", splitSize));
145+
}
146+
long fileSize = bamSize();
147+
List<Chunk> chunks = new ArrayList<>();
148+
for (int splitStart = 0; splitStart < fileSize; splitStart += splitSize) {
149+
Chunk chunk = getChunk(splitStart, splitStart + splitSize);
150+
if (chunk != null) {
151+
chunks.add(chunk);
152+
}
153+
}
154+
return chunks;
155+
}
156+
157+
/**
158+
* Return a chunk that corresponds to the given range in the BAM file. Note that the chunk does not necessarily
159+
* completely cover the given range, however this method will map a set of contiguous, non-overlapping file ranges
160+
* that cover the whole BAM file to a set of contiguous, non-overlapping chunks that cover the whole file.
161+
*
162+
* @param splitStart the start of the file range (inclusive)
163+
* @param splitEnd the start of the file range (exclusive)
164+
* @return a chunk whose virtual start is at the first alignment start position that is greater than or equal to the
165+
* given split start position, and whose virtual end is at the first alignment start position that is greater than
166+
* or equal to the given split end position, or null if the chunk would be empty.
167+
* @see #split(int)
168+
*/
169+
public Chunk getChunk(long splitStart, long splitEnd) {
170+
if (splitStart >= splitEnd) {
171+
throw new IllegalArgumentException(String.format("Split start (%s) must be less than end (%s)", splitStart, splitEnd));
172+
}
173+
long fileSize = bamSize();
174+
if (splitEnd > fileSize) {
175+
splitEnd = fileSize;
176+
}
177+
long virtualSplitStart = BlockCompressedFilePointerUtil.makeFilePointer(splitStart);
178+
long virtualSplitEnd = BlockCompressedFilePointerUtil.makeFilePointer(splitEnd);
179+
Long virtualSplitStartAlignment = virtualOffsets.ceiling(virtualSplitStart);
180+
Long virtualSplitEndAlignment = virtualOffsets.ceiling(virtualSplitEnd);
181+
// neither virtualSplitStartAlignment nor virtualSplitEndAlignment should ever be null, but check anyway
182+
if (virtualSplitStartAlignment == null) {
183+
throw new IllegalArgumentException(String.format("No virtual offset found for virtual file pointer %s, last virtual offset %s", virtualSplitStart, virtualOffsets.last()));
184+
}
185+
if (virtualSplitEndAlignment == null) {
186+
throw new IllegalArgumentException(String.format("No virtual offset found for virtual file pointer %s, last virtual offset %s", virtualSplitEnd, virtualOffsets.last()));
187+
}
188+
if (virtualSplitStartAlignment.longValue() == virtualSplitEndAlignment.longValue()) {
189+
return null;
190+
}
191+
return new Chunk(virtualSplitStartAlignment, virtualSplitEndAlignment);
192+
}
193+
194+
@Override
195+
public boolean equals(Object o) {
196+
if (this == o) return true;
197+
if (o == null || getClass() != o.getClass()) return false;
198+
199+
SplittingBAMIndex that = (SplittingBAMIndex) o;
200+
201+
return virtualOffsets.equals(that.virtualOffsets);
202+
}
203+
204+
@Override
205+
public int hashCode() {
206+
return virtualOffsets.hashCode();
207+
}
208+
209+
@Override
210+
public String toString() {
211+
return virtualOffsets.toString();
212+
}
213+
}

0 commit comments

Comments
 (0)