python - Output separated HBase columns using happybase -
I have an HBase table laid out like this:
total  date1:tcount1      date2:tcount2      ...
url1   date1:clickcount1  date2:clickcount2  ...
url2   date1:clickcount1  date2:clickcount2  ...
...
Here url1, url2, ... (and total) are the row keys. The table has only one column family.
I have a date range (from datei to datej) as input. I need to output each URL's share of the clicks for every day in that range.
The output must have the following format:
datei  url1:share1  url2:share1  ...
...
datej  url1:share1  url2:share1  ...
where
datei.url1:share1 = url1.datei:clickcount1 / total.datei:tcount1
I started writing a happybase script, but I don't know how to select separate columns of a row using happybase. My happybase script is below:
"""Query daily visit counts stored in an HBase table through happybase.

NOTE(review): this listing was reconstructed from a page whose text had been
lowercased and stripped of short words.  Identifier casing follows the
argparse / calendar / logging / happybase APIs; the exact wording of the
messages and the casing of the table-name prefix should be confirmed against
the original script.
"""

import argparse
import calendar
import getpass
import happybase
import logging
import random
import sys

USAGE = """

To query daily data for a year, run:
  $ {0} --action query --year 2014

To query daily data for a particular month, run:
  $ {0} --action query --year 2014 --month 10

To query daily data for a particular day, run:
  $ {0} --action query --year 2014 --month 10 --day 27

To compute totals add `--total` argument.

""".format(sys.argv[0])

logging.basicConfig(level="DEBUG")

# Candidate Thrift hosts: bds07 .. bds09.
HOSTS = ["bds%02d.vdi.mipt.ru" % i for i in range(7, 10)]
# NOTE(review): HBase table names are case-sensitive and the page lowercased
# this literal -- confirm the prefix casing before running.
TABLE = "visitcountpy-" + getpass.getuser()


def connect():
    """Open a connection to a randomly chosen host and return the table.

    The table named by ``TABLE`` is created with a single column family
    ``cf`` when it is not already listed by the server.
    """
    host = random.choice(HOSTS)
    conn = happybase.Connection(host)

    logging.debug("Connecting to HBase Thrift server on %s", host)
    conn.open()

    if TABLE not in conn.tables():
        # Create a table with column family `cf` with default settings.
        conn.create_table(TABLE, {"cf": dict()})
        logging.debug("Created table %s", TABLE)
    else:
        logging.debug("Using table %s", TABLE)
    return happybase.Table(TABLE, conn)


def query(args, table):
    """Print ``cf:value`` for every scanned row, or the sum with ``--total``.

    The scan is bounded by the smallest and largest ``YYYYMMDD`` keys
    produced by ``get_time_range(args)``.
    """
    keys = list(get_time_range(args))
    total = 0
    # NOTE(review): happybase documents `row_stop` as exclusive, so the last
    # day of the range may be left out of the scan -- confirm the intent.
    for key, data in table.scan(row_start=min(keys), row_stop=max(keys)):
        if args.total:
            total += int(data["cf:value"])
        else:
            print("%s\t%s" % (key, data["cf:value"]))
    if args.total:
        print("total\t%s" % total)


def get_time_range(args):
    """Yield ``YYYYMMDD`` row keys for the year/month/day selected in *args*.

    A missing month expands to all twelve months; a missing day expands to
    every day of each selected month.
    """
    cal = calendar.Calendar()
    years = [args.year]
    months = [args.month] if args.month is not None else range(1, 1 + 12)

    for year in years:
        for month in months:
            if args.day is not None:
                days = [args.day]
            else:
                days = cal.itermonthdays(year, month)
            for day in days:
                # itermonthdays() pads partial weeks with 0; skip those.
                if day > 0:
                    yield "%04d%02d%02d" % (year, month, day)


def main():
    """Parse the command line, validate it, and run the query.

    Raises:
        RuntimeError: if a day is given without a month, or the day is
            outside 1..31.
    """
    parser = argparse.ArgumentParser(description="An HBase example",
                                     usage=USAGE)
    parser.add_argument("--action", metavar="ACTION",
                        choices=("generate", "query"), required=True)
    parser.add_argument("--year", type=int, required=True)
    parser.add_argument("--month", type=int, default=None)
    parser.add_argument("--day", type=int, default=None)
    parser.add_argument("--total", action="store_true", default=False)

    args = parser.parse_args()

    # Validate before connecting so bad input never opens a network socket.
    if args.day is not None and args.month is None:
        raise RuntimeError("Please, specify month when specifying day.")
    # Day 0 is rejected as well: it would produce an empty key range and
    # make min()/max() in query() fail on an empty list.
    if args.day is not None and (args.day < 1 or args.day > 31):
        raise RuntimeError("Please, specify valid day.")

    table = connect()
    query(args, table)


if __name__ == "__main__":
    main()
So, how should I change the script (actually, the query() function) to output the separated columns in the defined date range?
I think you should use a scanner filter; you can provide a filter string (which is interpreted on the server) through the scan(filter=...) argument.
See https://github.com/wbolster/happybase/issues/11 for pointers (examples, docs).
Comments
Post a Comment