Skip to content

Commit cbd9179

Browse files
committed
Update of examples and streaming weighted sampling
1 parent 8b89501 commit cbd9179

File tree

3 files changed

+61
-15
lines changed

3 files changed

+61
-15
lines changed
Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
using StreamingSampling
2-
using StatsBase
2+
using StatsBase: sample, Weights
33

44
# Define file paths
55
base = haskey(ENV, "BASE_PATH") ? ENV["BASE_PATH"] : "../../"
@@ -9,22 +9,14 @@ file_paths = ["$path/data1.txt",
99
"$path/data3.txt",
1010
"$path/data4.txt"];
1111

12-
# Compute streaming weights
13-
ws = compute_weights(file_paths; chunksize=500, subchunksize=100)
14-
1512
# Define sample size
1613
n = 100;
1714

18-
# Compute first-order inclusion probabilities with sample size n
19-
ps = inclusion_prob(ws, n)
20-
21-
# Option 1: Sample by weighted sampling
22-
inds_w = StatsBase.sample(1:length(ws), Weights(ws), n; replace=false)
23-
24-
# Option 2: Sample by weighted sampling and first-order inclusion probabilities
25-
inds_p = StatsBase.sample(1:length(ps), Weights(ps), n; replace=false)
15+
# Streaming weighted sampling
16+
ws = compute_weights(file_paths; chunksize=500, subchunksize=100)
17+
inds_w = sample(1:length(ws), Weights(ws), n; replace=false)
2618

27-
# Option 3: Sample by UPmaxentropy
28-
s = UPmaxentropy(ps)
19+
# Streaming maximum entropy sampling
20+
s = UPmaxentropy(inclusion_prob(ws, n))
2921
inds_me = findall(s .== 1)
3022

src/StreamWeights.jl

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Streaming Weights
2+
3+
"""
    StreamWeights(file_paths::Vector{String}; kwargs...)
    StreamWeights(A::Vector; kwargs...)

Container for weights computed by `compute_weights`, together with the
`chunksize` and `subchunksize` used to compute them.

Both constructors first create the object with an empty `weights` vector and
then fill it with the result of `compute_weights`, forwarding every keyword
argument unchanged.

# Fields
- `weights::Vector{Float64}`: result of `compute_weights`.
- `chunksize::Int`: the `chunksize` keyword given to the constructor.
- `subchunksize::Int`: the `subchunksize` keyword given to the constructor.

# Keywords
- `read_element`, `create_feature`: callables forwarded to `compute_weights`
  (default to the functions of the same name in the enclosing scope).
- `chunksize=2000`, `subchunksize=200`, `buffersize=32`, `max=Inf`,
  `randomized=true`: forwarded to `compute_weights` as-is.
"""
mutable struct StreamWeights
    weights::Vector{Float64}
    chunksize::Int
    subchunksize::Int

    # Build the weights from a list of file paths.
    function StreamWeights(file_paths::Vector{String};
                           read_element=read_element,
                           create_feature=create_feature,
                           chunksize=2000,
                           subchunksize=200,
                           buffersize=32,
                           max=Inf,
                           randomized=true)
        ws = new(Vector{Float64}(), chunksize, subchunksize)
        # Use the freshly built object `ws`; the previous code referred to an
        # undefined name `sampler` here, which raised `UndefVarError`.
        # NOTE(review): this keeps the original call shape, which passes the
        # object as first argument — confirm `compute_weights` has a method
        # accepting a `StreamWeights` there.
        ws.weights = compute_weights(ws, file_paths;
                                     read_element=read_element,
                                     create_feature=create_feature,
                                     chunksize=chunksize,
                                     subchunksize=subchunksize,
                                     buffersize=buffersize,
                                     max=max,
                                     randomized=randomized)
        return ws
    end

    # Build the weights from an in-memory vector `A`.
    function StreamWeights(A::Vector;
                           read_element=read_element,
                           create_feature=create_feature,
                           chunksize=2000,
                           subchunksize=200,
                           buffersize=32,
                           max=Inf,
                           randomized=true)
        ws = new(Vector{Float64}(), chunksize, subchunksize)
        # Same fix as above: pass `ws`, not the undefined `sampler`.
        ws.weights = compute_weights(ws, A;
                                     read_element=read_element,
                                     create_feature=create_feature,
                                     chunksize=chunksize,
                                     subchunksize=subchunksize,
                                     buffersize=buffersize,
                                     max=max,
                                     randomized=randomized)
        return ws
    end
end
52+
53+
54+

src/StreamingSampling.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ include("RelFreqs.jl")
3131

3232
# Sampling
3333
include("UPmaxentropy.jl")
34-
#include("StreamWeights.jl")
34+
include("StreamWeights.jl")
3535
include("StreamMaxEnt.jl")
3636
include("Sampling.jl")
3737

0 commit comments

Comments
 (0)