View Javadoc

1   /**
2    * Copyright (c) 2010 Yahoo! Inc. All rights reserved.
3    *
4    *   http://www.apache.org/licenses/LICENSE-2.0
5    *
6    * Unless required by applicable law or agreed to in writing, software
7    * distributed under the License is distributed on an "AS IS" BASIS,
8    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9    * See the License for the specific language governing permissions and
10   * limitations under the License.
11   */
12  package org.apache.omid.benchmarks.utils;
13  
14  import java.util.Random;
15  
16  /**
17   * A generator of a zipfian distribution. It produces a sequence of items, such that some items are more popular than others, according
18   * to a zipfian distribution. When you construct an instance of this class, you specify the number of items in the set to draw from, either
19   * by specifying an itemcount (so that the sequence is of items from 0 to itemcount-1) or by specifying a min and a max (so that the sequence is of 
20   * items from min to max inclusive). After you construct the instance, you can change the number of items by calling nextInt(itemcount) or nextLong(itemcount).
21   *
22   * Note that the popular items will be clustered together, e.g. item 0 is the most popular, item 1 the second most popular, and so on (or min is the most 
23   * popular, min+1 the next most popular, etc.) If you don't want this clustering, and instead want the popular items scattered throughout the 
24   * item space, then use ScrambledZipfianGenerator instead.
25   *
26   * Be aware: initializing this generator may take a long time if there are lots of items to choose from (e.g. over a minute
27   * for 100 million objects). This is because certain mathematical values need to be computed to properly generate a zipfian skew, and one of those
28   * values (zeta) is a sum sequence from 1 to n, where n is the itemcount. Note that if you increase the number of items in the set, we can compute
29   * a new zeta incrementally, so it should be fast unless you have added millions of items. However, if you decrease the number of items, we recompute
30   * zeta from scratch, so this can take a long time. 
31   *
32   * The algorithm used here is from "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al, SIGMOD 1994.
33   */
34  public class ZipfianGenerator extends IntegerGenerator {
35      public static final double ZIPFIAN_CONSTANT = 0.99;
36  
37      /**
38       * Number of items.
39       */
40      long items;
41  
42      /**
43       * Min item to generate.
44       */
45      long base;
46  
47      /**
48       * The zipfian constant to use.
49       */
50      double zipfianconstant;
51  
52      /**
53       * Computed parameters for generating the distribution.
54       */
55      double alpha, zetan, eta, theta, zeta2theta;
56  
57      /**
58       * The number of items used to compute zetan the last time.
59       */
60      long countforzeta;
61  
62      /**
63       * Flag to prevent problems. If you increase the number of items the zipfian generator is allowed to choose from, this code will incrementally compute a new zeta
64       * value for the larger itemcount. However, if you decrease the number of items, the code computes zeta from scratch; this is expensive for large itemsets.
65       * Usually this is not intentional; e.g. one thread thinks the number of items is 1001 and calls "nextLong()" with that item count; then another thread who thinks the
66       * number of items is 1000 calls nextLong() with itemcount=1000 triggering the expensive recomputation. (It is expensive for 100 million items, not really for 1000 items.) Why
67       * did the second thread think there were only 1000 items? maybe it read the item count before the first thread incremented it. So this flag allows you to say if you really do
68       * want that recomputation. If true, then the code will recompute zeta if the itemcount goes down. If false, the code will assume itemcount only goes up, and never recompute.
69       */
70      boolean allowitemcountdecrease = false;
71  
72      Random random = new Random();
73  
74      /******************************* Constructors **************************************/
75  
76      /**
77       * Create a zipfian generator for the specified number of items.
78       * @param _items The number of items in the distribution.
79       */
80      public ZipfianGenerator(long _items) {
81          this(0, _items - 1);
82      }
83  
84      /**
85       * Create a zipfian generator for items between min and max.
86       * @param _min The smallest integer to generate in the sequence.
87       * @param _max The largest integer to generate in the sequence.
88       */
89      public ZipfianGenerator(long _min, long _max) {
90          this(_min, _max, ZIPFIAN_CONSTANT);
91      }
92  
93      /**
94       * Create a zipfian generator for the specified number of items using the specified zipfian constant.
95       *
96       * @param _items The number of items in the distribution.
97       * @param _zipfianconstant The zipfian constant to use.
98       */
99      public ZipfianGenerator(long _items, double _zipfianconstant) {
100         this(0, _items - 1, _zipfianconstant);
101     }
102 
103     /**
104      * Create a zipfian generator for items between min and max (inclusive) for the specified zipfian constant.
105      * @param min The smallest integer to generate in the sequence.
106      * @param max The largest integer to generate in the sequence.
107      * @param _zipfianconstant The zipfian constant to use.
108      */
109     public ZipfianGenerator(long min, long max, double _zipfianconstant) {
110         this(min, max, _zipfianconstant, zetastatic(max - min + 1, _zipfianconstant));
111     }
112 
113     /**
114      * Create a zipfian generator for items between min and max (inclusive) for the specified zipfian constant, using the precomputed value of zeta.
115      *
116      * @param min The smallest integer to generate in the sequence.
117      * @param max The largest integer to generate in the sequence.
118      * @param _zipfianconstant The zipfian constant to use.
119      * @param _zetan The precomputed zeta constant.
120      */
121     public ZipfianGenerator(long min, long max, double _zipfianconstant, double _zetan) {
122 
123         items = max - min + 1;
124         base = min;
125         zipfianconstant = _zipfianconstant;
126 
127         theta = zipfianconstant;
128 
129         zeta2theta = zeta(2, theta);
130 
131 
132         alpha = 1.0 / (1.0 - theta);
133         //zetan=zeta(items,theta);
134         zetan = _zetan;
135         countforzeta = items;
136         eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan);
137 
138         //System.out.println("XXXX 3 XXXX");
139         nextInt();
140         //System.out.println("XXXX 4 XXXX");
141     }
142 
143     /**************************************************************************/
144 
145     /**
146      * Compute the zeta constant needed for the distribution. Do this from scratch for a distribution with n items, using the
147      * zipfian constant theta. Remember the value of n, so if we change the itemcount, we can recompute zeta.
148      *
149      * @param n The number of items to compute zeta over.
150      * @param theta The zipfian constant.
151      */
152     double zeta(long n, double theta) {
153         countforzeta = n;
154         return zetastatic(n, theta);
155     }
156 
157     /**
158      * Compute the zeta constant needed for the distribution. Do this from scratch for a distribution with n items, using the
159      * zipfian constant theta. This is a static version of the function which will not remember n.
160      * @param n The number of items to compute zeta over.
161      * @param theta The zipfian constant.
162      */
163     static double zetastatic(long n, double theta) {
164         return zetastatic(0, n, theta, 0);
165     }
166 
167     /**
168      * Compute the zeta constant needed for the distribution. Do this incrementally for a distribution that
169      * has n items now but used to have st items. Use the zipfian constant theta. Remember the new value of
170      * n so that if we change the itemcount, we'll know to recompute zeta.
171      *
172      * @param st The number of items used to compute the last initialsum
173      * @param n The number of items to compute zeta over.
174      * @param theta The zipfian constant.
175      * @param initialsum The value of zeta we are computing incrementally from.
176      */
177     double zeta(long st, long n, double theta, double initialsum) {
178         countforzeta = n;
179         return zetastatic(st, n, theta, initialsum);
180     }
181 
182     /**
183      * Compute the zeta constant needed for the distribution. Do this incrementally for a distribution that
184      * has n items now but used to have st items. Use the zipfian constant theta. Remember the new value of
185      * n so that if we change the itemcount, we'll know to recompute zeta.
186      * @param st The number of items used to compute the last initialsum
187      * @param n The number of items to compute zeta over.
188      * @param theta The zipfian constant.
189      * @param initialsum The value of zeta we are computing incrementally from.
190      */
191     static double zetastatic(long st, long n, double theta, double initialsum) {
192         double sum = initialsum;
193         for (long i = st; i < n; i++) {
194 
195             sum += 1 / (Math.pow(i + 1, theta));
196         }
197 
198         //System.out.println("countforzeta="+countforzeta);
199 
200         return sum;
201     }
202 
203     /****************************************************************************************/
204 
205     /**
206      * Generate the next item. this distribution will be skewed toward lower integers; e.g. 0 will
207      * be the most popular, 1 the next most popular, etc.
208      * @param itemcount The number of items in the distribution.
209      * @return The next item in the sequence.
210      */
211     public int nextInt(int itemcount) {
212         return (int) nextLong(itemcount);
213     }
214 
215     /**
216      * Generate the next item as a long.
217      *
218      * @param itemcount The number of items in the distribution.
219      * @return The next item in the sequence.
220      */
221     public long nextLong(long itemcount) {
222         //from "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al, SIGMOD 1994
223 
224         if (itemcount != countforzeta) {
225 
226             //have to recompute zetan and eta, since they depend on itemcount
227             synchronized (this) {
228                 if (itemcount > countforzeta) {
229                     //System.err.println("WARNING: Incrementally recomputing Zipfian distribtion. (itemcount="+itemcount+" countforzeta="+countforzeta+")");
230 
231                     //we have added more items. can compute zetan incrementally, which is cheaper
232                     zetan = zeta(countforzeta, itemcount, theta, zetan);
233                     eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan);
234                 } else if ((itemcount < countforzeta) && (allowitemcountdecrease)) {
235                     //have to start over with zetan
236                     //note : for large itemsets, this is very slow. so don't do it!
237 
238                     //TODO: can also have a negative incremental computation, e.g. if you decrease the number of items, then just subtract
239                     //the zeta sequence terms for the items that went away. This would be faster than recomputing from scratch when the number of items
240                     //decreases
241 
242                     System.err.println("WARNING: Recomputing Zipfian distribtion. This is slow and should be avoided. (itemcount=" + itemcount + " countforzeta=" + countforzeta + ")");
243 
244                     zetan = zeta(itemcount, theta);
245                     eta = (1 - Math.pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan);
246                 }
247             }
248         }
249 
250         double u = random.nextDouble();
251         double uz = u * zetan;
252 
253         if (uz < 1.0) {
254             return 0;
255         }
256 
257         if (uz < 1.0 + Math.pow(0.5, theta)) {
258             return 1;
259         }
260 
261         long ret = base + (long) ((itemcount) * Math.pow(eta * u - eta + 1, alpha));
262         setLastInt((int) ret);
263         return ret;
264     }
265 
266     /**
267      * Return the next value, skewed by the Zipfian distribution. The 0th item will be the most popular, followed by the 1st, followed
268      * by the 2nd, etc. (Or, if min != 0, the min-th item is the most popular, the min+1th item the next most popular, etc.) If you want the
269      * popular items scattered throughout the item space, use ScrambledZipfianGenerator instead.
270      */
271     @Override
272     public int nextInt() {
273         return (int) nextLong(items);
274     }
275 
276     /**
277      * Return the next value, skewed by the Zipfian distribution. The 0th item will be the most popular, followed by the 1st, followed
278      * by the 2nd, etc. (Or, if min != 0, the min-th item is the most popular, the min+1th item the next most popular, etc.) If you want the
279      * popular items scattered throughout the item space, use ScrambledZipfianGenerator instead.
280      */
281     public long nextLong() {
282         return nextLong(items);
283     }
284 
285     /**
286      * @todo Implement ZipfianGenerator.mean()
287      */
288     @Override
289     public double mean() {
290         throw new UnsupportedOperationException("@todo implement ZipfianGenerator.mean()");
291     }
292 }