File tree Expand file tree Collapse file tree 3 files changed +61
-15
lines changed Expand file tree Collapse file tree 3 files changed +61
-15
lines changed Original file line number Diff line number Diff line change 11using  StreamingSampling
2- using  StatsBase
2+ using  StatsBase:  sample, Weights 
33
44#  Define file paths
55base =  haskey (ENV , " BASE_PATH" ?  ENV [" BASE_PATH" :  " ../../" 
@@ -9,22 +9,14 @@ file_paths = ["$path/data1.txt",
99              " $path /data3.txt" 
1010              " $path /data4.txt" 
1111
12- #  Compute streaming weights
13- ws =  compute_weights (file_paths; chunksize= 500 , subchunksize= 100 )
14- 
1512#  Define sample size
1613n =  100 ;
1714
18- #  Compute first-order inclusion probabilities with sample size n
19- ps =  inclusion_prob (ws, n)
20- 
21- #  Option 1: Sample by weighted sampling
22- inds_w =  StatsBase. sample (1 : length (ws), Weights (ws), n; replace= false )
23- 
24- #  Option 2: Sample by weighted sampling and first-order inclusion probabilities
25- inds_p =  StatsBase. sample (1 : length (ps), Weights (ps), n; replace= false )
15+ #  Streaming weighted sampling
16+ ws =  compute_weights (file_paths; chunksize= 500 , subchunksize= 100 )
17+ inds_w =  sample (1 : length (ws), Weights (ws), n; replace= false )
2618
27- #  Option 3: Sample by UPmaxentropy 
28- s =  UPmaxentropy (ps )
19+ #  Streaming maximum entropy sampling 
20+ s =  UPmaxentropy (inclusion_prob (ws, n) )
2921inds_me =  findall (s .==  1 )
3022
Original file line number Diff line number Diff line change 1+ #  Streaming Weights
2+ 
"""
    StreamWeights(file_paths::Vector{String}; kwargs...)
    StreamWeights(A::Vector; kwargs...)

Hold the weights produced by `compute_weights` for a data source, together with
the `chunksize` and `subchunksize` values used to produce them.

The data source is either a vector of file paths or an in-memory vector `A`.

# Fields
- `weights::Vector{Float64}`: value returned by `compute_weights` for the input.
- `chunksize::Int`: the `chunksize` keyword given at construction.
- `subchunksize::Int`: the `subchunksize` keyword given at construction.

# Keywords
All keywords are forwarded unchanged to `compute_weights`:
`read_element`, `create_feature`, `chunksize` (default `2000`),
`subchunksize` (default `200`), `buffersize` (default `32`),
`max` (default `Inf`), and `randomized` (default `true`).
"""
mutable struct StreamWeights
    weights::Vector{Float64}
    chunksize::Int
    subchunksize::Int

    # Build from a list of files.
    function StreamWeights(file_paths::Vector{String};
                           read_element=read_element,
                           create_feature=create_feature,
                           chunksize=2000,
                           subchunksize=200,
                           buffersize=32,
                           max=Inf,
                           randomized=true)
        # Start with an empty weight vector; it is filled in right below.
        ws = new(Vector{Float64}(), chunksize, subchunksize)
        # The freshly built object is `ws`; the previous code referred to an
        # undefined `sampler` here, which raised `UndefVarError` on every call.
        # NOTE(review): assumes a `compute_weights(::StreamWeights, file_paths; ...)`
        # method exists -- confirm, or drop the leading argument if only the
        # `compute_weights(file_paths; ...)` form is defined.
        ws.weights = compute_weights(ws, file_paths;
                                     read_element=read_element,
                                     create_feature=create_feature,
                                     chunksize=chunksize,
                                     subchunksize=subchunksize,
                                     buffersize=buffersize,
                                     max=max,
                                     randomized=randomized)
        return ws
    end

    # Build from an in-memory vector of elements.
    function StreamWeights(A::Vector;
                           read_element=read_element,
                           create_feature=create_feature,
                           chunksize=2000,
                           subchunksize=200,
                           buffersize=32,
                           max=Inf,
                           randomized=true)
        # Start with an empty weight vector; it is filled in right below.
        ws = new(Vector{Float64}(), chunksize, subchunksize)
        # Pass the local `ws` (not the undefined `sampler`) as the first argument.
        # NOTE(review): assumes a `compute_weights(::StreamWeights, A; ...)`
        # method exists -- confirm against the definition of `compute_weights`.
        ws.weights = compute_weights(ws, A;
                                     read_element=read_element,
                                     create_feature=create_feature,
                                     chunksize=chunksize,
                                     subchunksize=subchunksize,
                                     buffersize=buffersize,
                                     max=max,
                                     randomized=randomized)
        return ws
    end
end
52+ 
53+ 
54+ 
Original file line number Diff line number Diff line change @@ -31,7 +31,7 @@ include("RelFreqs.jl")
3131
3232#  Sampling
3333include (" UPmaxentropy.jl" 
34- # include("StreamWeights.jl")
34+ include (" StreamWeights.jl" 
3535include (" StreamMaxEnt.jl" 
3636include (" Sampling.jl" 
3737
 
 
   
 
     
   
   
          
    
    
     
    
      
     
     
    You can’t perform that action at this time.
  
 
    
  
    
      
        
     
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments