blob: b3adb7e1ff63a2401dc66c151e74c5deb91ac6f8 [file] [log] [blame]
# BEGIN LICENSE BLOCK
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Places Test code.
#
# The Initial Developer of the Original Code is Mozilla Corp.
# Portions created by the Initial Developer are Copyright (C) 2009
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# David Dahl <ddahl@mozilla.com>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# END LICENSE BLOCK
"""
using the 'Places generator'
----------------------------
Requirements: django.db - you should have a full install of the Django 1.0.2 release. We are using Django solely for its ORM and its 'manage.py inspectdb' utility, which reverse-engineers a db schema into Django Model classes.
You must have django and the 'places' python module in your PYTHONPATH
in my ~/.zshrc I have:
export PYTHONPATH=$PYTHONPATH:~/code/python:~/code/mozilla-central/mozilla/toolkit/components/places/tests/generator/places
inside ~/code/mozilla-central/mozilla/toolkit/components/places/tests/generator is the places/ Django 'project'
You must set 2 ENV VARS:
1) PLACES_DB_PATH
e.g.: export PLACES_DB_PATH=~/code/python/places/places.sqlite
2) DJANGO_SETTINGS_MODULE
e.g.: export DJANGO_SETTINGS_MODULE=places.settings
Generate Script Docs:
python places/builddb/generate.py --help
================================================
This will fill your places.sqlite with a bunch of
fictitious places, history and bookmarks based on the current 'max' aggregation at https://places-stats.mozilla.com/
"""
import os
from optparse import OptionParser
import random
import urllib2
import random
import time
from copy import deepcopy
from math import ceil, floor
from places import uuid
from places.builddb.models import *
from places.builddb.http import get_web_data
"""
Places data generator. Creates as many places/history/bookmarks as needed for testing Places queries for performance.
Tunable parameters: absolute number of visits and bookmarks, and their age
(random dates are created up to today).
"""
# Module-wide verbosity switch: functions in this file print progress
# and the SQL they run when it is true.
DEBUG = True
# Fraction (0.3 = 30%) of generated places whose 'typed' flag is set.
PERCENT_PLACES_TYPED = 0.3
# One day (60 * 60 * 24 seconds) expressed in microseconds.
TWENTY_FOUR_HOURS = (60 * 60 * 24 * 1000 * 1000)
from django.db import connection, transaction
class DateForward(object):
    """
    Move the timestamps stored in a Places database forward in time.

    Every date column listed below is advanced by the time that has elapsed
    since the most recent visit recorded for the placeholder place (the
    moz_places row titled 'Places Testing Placeholder'):

        moz_annos.dateAdded
        moz_annos.lastModified
        moz_annos.expiration
        moz_bookmarks.dateAdded
        moz_bookmarks.lastModified
        moz_favicons.expiration
        moz_historyvisits.visit_date
        moz_items_annos.dateAdded
        moz_items_annos.expiration
        moz_items_annos.lastModified

    All values are microseconds since the epoch.
    """

    def __init__(self):
        pass

    def table_column(self):
        """
        Return a dict mapping each table name to the list of its date
        columns that need to be updated.
        """
        return {
            'moz_annos': ['dateadded', 'lastmodified', 'expiration', ],
            'moz_bookmarks': ['dateadded', 'lastmodified', ],
            'moz_favicons': ['expiration'],
            'moz_historyvisits': ['visit_date'],
            'moz_items_annos': ['dateadded', 'lastmodified', 'expiration', ],
        }

    def place_id(self):
        """
        Return the id of the placeholder row in moz_places.

        Raises IndexError when no row has the placeholder title.
        """
        sql = """select id from moz_places where title = 'Places Testing Placeholder'"""
        cursor = connection.cursor()
        try:
            # Fetch from the cursor itself: the DB-API leaves the return
            # value of execute() undefined.
            cursor.execute(sql, [])
            rows = cursor.fetchall()
        finally:
            cursor.close()
        return rows[0][0]

    def last_update(self):
        """
        Return the visit_date of the newest moz_historyvisits row (highest
        id) belonging to the placeholder place.

        Raises IndexError when the placeholder has no visits.
        """
        placeholder_id = self.place_id()
        sql = """SELECT visit_date FROM moz_historyvisits WHERE place_id = %s ORDER BY id DESC""" % placeholder_id
        cursor = connection.cursor()
        try:
            cursor.execute(sql)
            rows = cursor.fetchall()
        finally:
            cursor.close()
        return rows[0][0]

    def increment_dates(self):
        """
        Advance every tracked date column by the time elapsed since the
        last recorded run, then VACUUM/REINDEX and record a new run.

        Sets self.now_micro_secs (current time) and self.time_diff (the
        amount added to each column), both in microseconds.
        """
        # Query once and reuse the value for both the arithmetic and the
        # debug output.
        last_update = self.last_update()
        # Whole seconds; int() keeps the SQL built later free of floats
        # should the column come back as a REAL.
        last_update_seconds = int(last_update) // (1000 * 1000)
        now = int(time.time())
        self.now_micro_secs = now * 1000 * 1000
        if DEBUG:
            print("last update microseconds: %s" % last_update)
            print("now: %s" % self.now_micro_secs)
        self.time_diff = (now - last_update_seconds) * 1000 * 1000
        if DEBUG:
            print("\nTIME DIFF (microseconds): %s\n\n" % str(self.time_diff))
        steps = (
            ("Incrementing History", self.increment_history),
            ("Incrementing Bookmarks", self.increment_bookmarks),
            ("Incrementing Favicons", self.increment_favicons),
            ("Incrementing Annos", self.increment_annos),
            ("Incrementing Items Annos", self.increment_items_annos),
            ("VACUUM/REINDEX", self.vacuum_and_reindex),
        )
        for label, step in steps:
            if DEBUG:
                print(label)
            step()
        # Insert a fresh placeholder visit so the next run measures from now.
        last_time_run()

    def increment_history(self):
        """Advance moz_historyvisits.visit_date, skipping the placeholder's own visits."""
        self.increment_query(
            'moz_historyvisits', 'visit_date', restrict_update=True)

    def increment_bookmarks(self):
        """Advance the moz_bookmarks date columns."""
        self.increment_query('moz_bookmarks', 'dateadded')
        self.increment_query('moz_bookmarks', 'lastmodified')

    def date_missing(self, _date):
        """
        Return True when _date is 0, None or the empty string, else False.
        """
        return _date in (0, None, '')

    def increment_query(self, table_name, col_name, restrict_update=False):
        """
        Add self.time_diff to col_name for every row of table_name whose
        value is non-empty and earlier than self.now_micro_secs.

        When restrict_update is true, rows whose place_id is the
        placeholder place are left untouched.

        Errors are printed and re-raised.
        """
        placeholder_id = self.place_id() if restrict_update else None
        try:
            conditions = ["%s < %s" % (col_name, self.now_micro_secs)]
            if restrict_update:
                conditions.append("place_id != %s" % placeholder_id)
            conditions.append("%s != ''" % col_name)
            sql = "UPDATE %s SET %s  = %s + %s WHERE %s" % (
                table_name, col_name, col_name, self.time_diff,
                " AND ".join(conditions))
            if DEBUG:
                print(sql)
            cursor = connection.cursor()
            try:
                cursor.execute(sql)
            finally:
                cursor.close()
        except Exception as e:
            print("""************ ERROR *******************""")
            print("""***%s""" % e)
            raise

    def increment_favicons(self):
        """Advance moz_favicons.expiration."""
        self.increment_query('moz_favicons', 'expiration')

    def increment_annos(self):
        """Advance the moz_annos date columns."""
        for column in ('dateadded', 'lastmodified', 'expiration'):
            self.increment_query('moz_annos', column)

    def increment_items_annos(self):
        """Advance the moz_items_annos date columns."""
        for column in ('dateadded', 'lastmodified', 'expiration'):
            self.increment_query('moz_items_annos', column)

    def vacuum_and_reindex(self):
        """
        Run VACUUM followed by REINDEX on the database.

        Errors are printed and re-raised.
        """
        try:
            cursor = connection.cursor()
            try:
                cursor.execute("VACUUM;")
                cursor.execute("REINDEX;")
            finally:
                cursor.close()
        except Exception as e:
            print("""************ ERROR *******************""")
            print("""***%s""" % e)
            raise
def right_now():
    """
    Current time as a float number of microseconds since the epoch.
    """
    seconds = time.time()
    return seconds * 1000 * 1000
def six_months_ago():
    """
    Microsecond timestamp for the moment 180 days before right_now()
    (a rough six months).
    """
    micros_per_day = 60 * 60 * 24 * 1000 * 1000
    return right_now() - (micros_per_day * 180)
def random_date():
    """
    Pick a random integer timestamp (microseconds since the epoch) that
    falls between six_months_ago() and right_now(), both inclusive.
    """
    earliest = int(six_months_ago())
    latest = int(right_now())
    return random.randint(earliest, latest)
def alpha():
    """Return the lowercase letters 'a' through 'z' as a list of one-character strings."""
    return [chr(code) for code in range(ord('a'), ord('z') + 1)]


# Shared pool of letters that the random string builders sample from.
ALPHA = alpha()
def numbers():
    """Return the digits 0 through 9 as a list of ints."""
    return list(range(10))
def new_favicon():
    """
    Insert and return a MozFavicons row.

    The url embeds a value from uuid.generate(), the data column gets a
    fixed filler string (not real image bytes), the mime type is
    'image/png' and the expiration is a random_date().
    """
    icon_url = 'http://tld.tld/%s.ico' % uuid.generate()
    filler = 'wootuyiuyeryuwyerywueyurywueyruuewryueyuryuewyurkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjsdhjshjdhjshdjhjsdhjshjkdhjshjdhjshkjdhsjkhdkjhsjdhjshjhdd;sklfnlk;dnfjnklwmnkldjfklhadlfgjkhrnfjkgb;jlshgn;ruignifng;irgn;srhg;rnv;irn;'
    return MozFavicons.objects.create(
        url=icon_url,
        data=filler,
        mime_type='image/png',
        expiration=random_date())
def reverse_host(url):
    """
    Return the network-location part of url reversed character by
    character, followed by a '.'.

    e.g. 'http://www.mozilla.com/x' -> 'moc.allizom.www.'
    """
    # Import the URL parser directly rather than reaching it through
    # urllib2's namespace, which only exposes it as a side effect of its
    # own imports.
    try:
        from urlparse import urlparse
    except ImportError:
        # Location of the same function on Python 3.
        from urllib.parse import urlparse
    netloc = urlparse(url)[1]
    return netloc[::-1] + '.'
def url_parts():
    """
    Build the random pieces of a URL and return them as a dictionary
    shaped like:
        {'proto': 'http',
         'host': 'www',
         'domain': 'foo',
         'tld': 'com'}

    host is 4-26 letters, domain 2-26 letters, tld 2-3 letters (no letter
    repeats within a piece) and proto is one of https, http or ftp.
    """
    def letters(shortest, longest):
        # Length is drawn first, then that many distinct letters.
        return "".join(random.sample(ALPHA, random.randint(shortest, longest)))

    schemes = ['https', 'http', 'ftp']
    host = letters(4, 26)
    domain = letters(2, 26)
    tld = letters(2, 3)
    proto = schemes[random.randint(0, 2)]
    return {'proto': proto, 'host': host, 'domain': domain, 'tld': tld}
def new_title():
    """
    Return a random title: 3 to 26 distinct lowercase letters.
    """
    return "".join(random.sample(ALPHA, random.randint(3, 26)))
def keyword_gen(n):
    """
    Create n keywords via make_keywords() and return a generator that
    yields them one at a time.
    """
    keywords, _count = make_keywords(n)
    return (keyword for keyword in keywords)
def make_keywords(n):
    """
    Save n MozKeywords rows, each a random string of 3 to 18 distinct
    letters, and return (list_of_keyword_objects, how_many_were_made).
    """
    created = []
    for _ in range(n):
        length = random.randint(3, 18)
        word = "".join(random.sample(ALPHA, length))
        MozKeywords(keyword=word).save()
        # Read the row back so the list holds objects loaded from the db.
        created.append(MozKeywords.objects.get(keyword=word))
    return (created, len(created))
def expo(MEAN, round_ceil='ceil'):
    """
    Draw an integer from an exponential distribution whose mean is MEAN.

    With round_ceil == 'ceil' (the default) the sample is rounded up;
    any other value rounds it to the nearest integer.
    """
    # NOTE(review): the source's indentation was lost; the final
    # int(round(...)) is taken to apply on every path - confirm.
    rate = 1.0 / MEAN  # lambda parameter expected by expovariate
    sample = random.expovariate(rate)
    if round_ceil == 'ceil':
        sample = int(ceil(sample))
    return int(round(sample))
def test_expo(num, MEAN):
    """
    Return a list of num samples from expo(MEAN), for eyeballing the
    shape of the distribution.
    """
    return [expo(MEAN) for _ in range(num)]
def frecency():
    """
    Return a random integer frecency value.

    A sample is drawn from an exponential distribution with mean 146.
    Samples below 1 become 0 or -1 (chosen by a coin flip); all others
    are rounded up to the next integer.
    """
    MEAN = 146  # 146 is our mean value, because ddahl's is 146
    val = random.expovariate(1.0 / MEAN)
    if val < 1:
        # flip a coin:
        return 0 if random.randint(0, 1) == 0 else -1
    # int(): math.ceil returns a float on Python 2, while the other
    # branches return ints.
    return int(ceil(val))
def test_frecency(num):
    """
    Return num frecency() samples as a list sorted in ascending order.
    """
    return sorted(frecency() for _ in range(num))
def flip_coin():
    """Return 0 or 1, each equally likely."""
    outcome = random.randint(0, 1)
    return outcome
def slant(rows, heads):
    """
    Build the outcomes of a coin toss slanted to a percentage.

    rows  -- total number of outcomes wanted
    heads -- fraction of the outcomes that should be heads (1)

    Returns a list of length rows holding all the 1s first, then the 0s.
    The number of 1s is rows * heads rounded to the nearest integer and
    kept within 0..rows.
    """
    rows = int(rows)
    # Round before converting: truncating first turns a product such as
    # 100 * 0.29 (28.999999999999996 in floating point) into 28.
    num_heads = int(round(rows * heads))
    num_heads = max(0, min(rows, num_heads))
    num_tails = max(0, rows - num_heads)
    return [1] * num_heads + [0] * num_tails
def flip_coin_slant(rows, heads):
    """
    Precompute slant(rows, heads) and return a function; each call to
    that function gives a generator over the precomputed outcomes.
    """
    outcomes = slant(rows, heads)

    def fcs():
        return (outcome for outcome in outcomes)

    return fcs
def last_time_run():
    """
    Record that this tool has just run and return the recorded timestamp.

    Uses get_or_create to make/get the placeholder place, inserts a
    moz_historyvisits row for it stamped with the current time, commits,
    and returns the visit_date (microseconds) of the placeholder's newest
    visit as read back from the database.
    """
    my_url = "http://mozilla.com/places_test_placeholder"
    my_title = "Places Testing Placeholder"
    MozPlaces.objects.get_or_create(
        url=my_url, title=my_title, rev_host=reverse_host(my_url),
        visit_count=1, hidden=0, typed=1, frecency=-1)
    cursor = connection.cursor()
    # Fetch from the cursor itself: the DB-API leaves the return value of
    # execute() undefined.
    cursor.execute("""select id from moz_places where title = 'Places Testing Placeholder'""")
    placeholder_id = int(cursor.fetchall()[0][0])
    # Convert to int before formatting: on Python 2 a float rendered with
    # %s is shortened to exponent notation, which drops the low digits of
    # a microsecond timestamp.
    visit_date = int(right_now())
    # insert the latest record into historyvisits
    sql = """INSERT INTO moz_historyvisits (from_visit, place_id, visit_date, visit_type, session ) VALUES (0, %s, %s, 1, 0 )""" % (placeholder_id, visit_date,)
    cursor.execute(sql)
    transaction.commit_unless_managed()
    sql = """SELECT visit_date FROM moz_historyvisits WHERE place_id = %s ORDER BY id DESC""" % placeholder_id
    cursor.execute(sql)
    return cursor.fetchall()[0][0]
def new_place(places_typed_gen):
    """
    Save a MozPlaces row with a random url and title and return the first
    MozPlaces row whose rev_host matches it.

    places_typed_gen -- iterator whose next value becomes the 'typed' flag

    Any error is printed and re-raised.
    """
    guid = uuid.generate()
    parts = url_parts()
    my_url = '%(proto)s://%(host)s.%(domain)s.%(tld)s/%(guid)s' % dict(
        parts, guid=guid)
    if DEBUG:
        print(my_url)
    my_title = new_title()
    frec = frecency()
    try:
        fields = dict(
            url=my_url,
            title=my_title,
            rev_host=reverse_host(my_url),
            # using 8 for MEAN for the fun of it for now
            visit_count=expo(8),
            hidden=0,
            typed=places_typed_gen.next(),
            favicon=new_favicon(),
            frecency=frec)
        place = MozPlaces(**fields)
        place.save()
        return MozPlaces.objects.filter(rev_host=place.rev_host)[0]
    except Exception as e:
        print(e)
        raise
def new_inputhistory(place):
    """
    Save and return a MozInputhistory row for the given place.

    The input text is 3 to 20 distinct random letters and use_count is a
    random integer from 2 to 9 multiplied by 0.9.
    """
    typed_text = "".join(random.sample(ALPHA, random.randint(3, 20)))
    use_count = random.randint(2, 9) * 0.9
    record = MozInputhistory(
        place_id=place.id, input=typed_text, use_count=use_count)
    record.save()
    return record
def new_history(place, qty=5):
    """
    Create qty MozHistoryvisits rows for place.

    All of them share one random_date() as their visit_date. On failure a
    message is printed and the error re-raised.
    """
    shared_date = random_date()
    try:
        for _ in range(qty):
            MozHistoryvisits.objects.create(
                from_visit=0,  # other visit this is related to/from/etc
                place=place,
                visit_date=shared_date,
                visit_type=1,
                session=0)
    except:
        print("ERROR: cannot create historyvisit for %s" % str(place.id))
        raise
def new_bookmark(place, kw_gen):
    """
    Create a MozBookmarks row pointing at place and return it.

    kw_gen -- iterator supplying the keyword object; if taking the next
              value fails for any reason the bookmark gets no keyword

    If the bookmark cannot be created the error is printed and None is
    returned.
    """
    try:
        keyword = kw_gen.next()
    except Exception as e:
        print(str(e))
        keyword = None
    try:
        fields = dict(
            _type=1,  # XXX: what should this be?
            fk=place.id,
            parent=5,  # XXX: what is unfiled bookmarks id?
            position=-1,  # XXX: what should this be?
            title=place.title,
            folder_type=1,  # XXX: what should this be?
            dateadded=random_date(),
            lastmodified=random_date(),
            keyword=keyword)
        return MozBookmarks.objects.create(**fields)
    except Exception as e:
        print(str(e))
        print("Err: Could not create bookmark from place: %s" % place.id)
        return None
def parse_places_stats_obj(obj, source_idx):
    """
    Parse and massage the input for the places generator.

    obj        -- mapping of aggregation name to a mapping of
                  '<table>_cnt' values (numbers or numeric strings)
    source_idx -- which aggregation of obj to read

    Returns an object with the attributes source, path, source_idx,
    n_places, h_mult, b_mod, n_keywords and n_inputhistory.

    Raises KeyError for a missing entry and ZeroDivisionError when the
    place or bookmark count is zero.
    """
    class Options(object):
        def __init__(self, stats_obj, source_idx):
            # XXX: check names in json obj
            self.source = 'places_stats_site'
            self.path = None
            self.source_idx = source_idx
            stats = stats_obj[source_idx]
            places_cnt = float(stats['moz_places_cnt'])
            visits_cnt = float(stats['moz_historyvisits_cnt'])
            bookmarks_cnt = float(stats['moz_bookmarks_cnt'])
            self.n_places = int(places_cnt)
            # visits to create per place
            self.h_mult = int(visits_cnt / places_cnt)
            # One bookmark per b_mod places. Keep it at 1 or more: with
            # more bookmarks than places the ratio truncates to 0, which
            # would make a modulo by this value raise ZeroDivisionError.
            self.b_mod = max(1, int(places_cnt / bookmarks_cnt))
            self.n_keywords = int(float(stats['moz_keywords_cnt']))
            self.n_inputhistory = int(float(stats['moz_inputhistory_cnt']))
            if DEBUG:
                print("########################################################")
                print("Creating %s Places" % self.n_places)
                print("Creating about %s History Visits" % stats['moz_historyvisits_cnt'])
                print("Creating about %s Bookmarks" % stats['moz_bookmarks_cnt'])
                print("Creating %s Keywords" % self.n_keywords)
                print("Creating %s Input History Records" % self.n_inputhistory)
                print("########################################################")
            else:
                print("Starting Places DB Generation")
            # NOTE(review): the source's indentation was lost; the pause is
            # taken to apply in both branches - confirm.
            time.sleep(3)

    return Options(obj, source_idx)
def main(source,
         source_idx,
         num_places,
         bookmarks_mod,
         history_mult,
         n_keywords,
         n_inputhistory,
         path,
         debug):
    """
    Configure and run the whole operation of creating data in
    places.sqlite.

    source, source_idx -- accepted for the caller's convenience; not used
    num_places         -- number of places to create
    bookmarks_mod      -- a bookmark is created whenever the count of
                          places still to create is a multiple of this
    history_mult       -- number of history visits to create per place
    n_keywords         -- number of keywords to create up front
    n_inputhistory     -- number of input history rows to create (one per
                          place until used up)
    path               -- if set, exported as PLACES_DB_PATH
    debug              -- turns the module's DEBUG output on or off
    """
    # Rebind the module-level flag: a plain assignment here would create a
    # local and leave the output of the helper functions unchanged.
    global DEBUG
    DEBUG = bool(debug)
    if path:
        os.environ['PLACES_DB_PATH'] = path
    if DEBUG:
        print("Creating %s Places" % num_places)
    kw_gen = keyword_gen(n_keywords)
    places_typed_gen = flip_coin_slant(num_places, PERCENT_PLACES_TYPED)()
    counter = 1
    while num_places > 0:
        place = new_place(places_typed_gen)  # create a place
        if DEBUG:
            print('Place #%s created' % counter)
        if n_inputhistory > 0:
            new_inputhistory(place)
            n_inputhistory = n_inputhistory - 1
        # Honour the requested visits-per-place instead of always falling
        # back to new_history's default.
        new_history(place, history_mult)
        if num_places % bookmarks_mod == 0:
            new_bookmark(place, kw_gen)  # create a bookmark
        num_places = num_places - 1
        counter = counter + 1
    # update the time this was last run. The value is stored in a
    # moz_historyvisits row as 'visit_date' for the place with the
    # uri of "http://mozilla.com/places_test_placeholder'
    if DEBUG:
        print("Updating last time run timestamp")
    try:
        last_time_run()
    except Exception as e:
        print(e)
        raise
# Command-line entry point: parse the options, optionally replace them with
# figures fetched by get_web_data(), then run main().
if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-p", "--path", dest="path", default=None,
                      help="Path to SQLite db", metavar="PATH")
    parser.add_option("-s", "--source",
                      dest="source",
                      default='places_stats_site',
                      help="'made_up' data or 'places_stats_site' data?",
                      metavar="SOURCE")
    parser.add_option("-i", "--source_idx", dest="source_idx",
                      default='max',
                      help="Source object index: avg, min, max",
                      metavar="SOURCE_IDX")
    # type="int" on the counts: without it a value given on the command
    # line arrives as a string and breaks the arithmetic done with it.
    parser.add_option("-n", "--num-places", dest="n_places",
                      type="int", default=300,
                      help="Number of Places to create",
                      metavar="NUM_PLACES")
    parser.add_option("-b", "--bookmarks-modulus", dest="b_mod",
                      type="int", default=4,
                      help="Bookmarks Modulus, the number of places for which a single bookmark is created",
                      metavar="BOOKMARKS_MODULUS")
    parser.add_option("-k", "--num-keywords", dest="n_keywords",
                      type="int", default=15,
                      help="Number of total Keywords",
                      metavar="KEYWORDS")
    parser.add_option(
        "-t", "--input-history", dest="n_inputhistory",
        type="int", default=50,
        help="Number of Input History rows",
        metavar="INPUT_HISTORY")
    parser.add_option("-m", "--history-mult", dest="h_mult",
                      type="int", default=5,
                      help="The number of history items to create for each place", metavar="HISTORY_MULT")
    parser.add_option("-v", action="store_true", dest="debug",
                      help="Turn DEBUG on")
    parser.add_option("-q", action="store_false", dest="debug",
                      help="Turn DEBUG off")
    # -v and -q share one destination, so state its default once. False is
    # the value that was already in effect (the later of the two
    # per-option defaults won).
    parser.set_defaults(debug=False)
    (options, args) = parser.parse_args()
    if options.source == 'places_stats_site':
        # Replace the counts with figures derived from the stats site, but
        # keep the path and debug choices made on the command line.
        cmd_line_options = options
        stats_obj = get_web_data()
        options = parse_places_stats_obj(stats_obj,
                                         cmd_line_options.source_idx)
        options.path = cmd_line_options.path
        options.debug = cmd_line_options.debug
    KEYWORD_COUNT = 0
    KEYWORD_IDS = []
    main(options.source,
         options.source_idx,
         options.n_places,
         int(options.b_mod),
         int(options.h_mult),
         options.n_keywords,
         options.n_inputhistory,
         options.path,
         options.debug)