diff --git a/qa-engine/convert_to_sql.py b/qa-engine/convert_to_sql.py index e603622..7f1cf88 100644 --- a/qa-engine/convert_to_sql.py +++ b/qa-engine/convert_to_sql.py @@ -10,15 +10,16 @@ import sqlite3 from django.utils.encoding import smart_str, smart_unicode import gc +import os now = datetime.now() -FILES = ('Users.xml', 'Posts.xml', 'Votes.xml') +FILES = ('users.xml', 'posts.xml', 'votes.xml', 'comments.xml') MAX_VALUES = 250 msstrip = re.compile(r'^(.*)\.\d+') OSQA_DB_USERNAME = 'osqa' OSQA_DB_PASSWORD = 'osqapass' -OSQA_DB_NAME = 'test' +OSQA_DB_NAME = 'osqa' def escape(string): @@ -41,10 +42,10 @@ def readTime(ts): except ValueError: return datetime(*time.strptime(ts, '%Y-%m-%d')[0:3]) - -def getFilePath(name): - root_dir = os.path.dirname(os.path.realpath(__file__)) - return os.path.join(root_dir, name) +# Not need this in batch_process +#def getFilePath(name): +# root_dir = os.path.dirname(os.path.realpath(__file__)) +# return os.path.join(root_dir, name) def writew(f, header, itervalues, func, check=lambda x: True, on_duplicate=None): @@ -68,7 +69,7 @@ class Writer(): def __init__(self, count, name): self.count = count - self.FILES = [open(getFilePath("%s-%s.sql" % (name, i)), "w") for i in xrange(count)] + self.FILES = [open(("%s-%s.sql" % (name, i)), "w") for i in xrange(count)] def write(self, string, is_header=False): if is_header: @@ -94,14 +95,14 @@ class PostsConverter(): revision_id = 0 parent_is_not_available = {} header = """INSERT INTO forum_node - (id, title, tagnames, author_id, body, node_type, parent_id, - added_at, score, state_string, last_activity_by_id, last_activity_at, - active_revision_id, extra_count, marked) VALUES """.encode("utf-8") +(id, title, tagnames, author_id, body, node_type, parent_id, +added_at, score, state_string, last_activity_by_id, last_activity_at, +active_revision_id, extra_count, marked) VALUES """.encode("utf-8") def __init__(self, usermap): self.usermap = usermap - self.con = sqlite3.connect("/tmp/sqlite.db") + self.con = sqlite3.connect("/home/syan3/test_convert/result/batchimport/foo4.db") cur = self.con.cursor() cur.execute("CREATE TABLE IF NOT EXISTS tags " + "(id INT, name TEXT UNIQUE, used_count INT, updated INT, PRIMARY KEY(id ASC))") @@ -154,7 +155,7 @@ def create_and_activate_revision(self, title, owner_id, creation_date, body): cur = self.con.cursor() self.revision_id += 1 values = u"(%d, '%s','',%d,'%s',%s,'Initial revision',1,'%s')" % ( - self.revision_id, title, owner_id, '', # writing empty body + self.revision_id, title, owner_id, '', # writing empty body self.current_post_id, readTime(creation_date) ) cur.execute("INSERT INTO revisions VALUES(?)", (values,) ) return self.revision_id @@ -185,8 +186,8 @@ def make_sql(self, obj): def finalize(self): tags_header = """INSERT INTO forum_tag - (id, name, created_by_id, created_at, used_count) VALUES """ - f = open(getFilePath("posts-misc.sql"), "w") +(id, name, created_by_id, created_at, used_count) VALUES """ + f = open(("posts-misc.sql"), "w") cur = self.con.cursor() writew(f, tags_header, cur.execute("SELECT id, name, used_count FROM tags WHERE updated=1"), @@ -196,8 +197,8 @@ def finalize(self): nodetags_header = u"INSERT INTO forum_node_tags(node_id,tag_id) VALUES " writew(f, nodetags_header, self.nodetags, lambda x: u"(%s,%s)" % x) revisions_header = """INSERT INTO forum_noderevision - (id, title, tagnames, author_id, body, node_id, - summary, revision, revised_at) VALUES """ +(id, title, tagnames, author_id, body, node_id, +summary, revision, revised_at) VALUES """ writew(f, revisions_header, cur.execute("SELECT data FROM revisions"), lambda x: x[0]) @@ -240,14 +241,14 @@ def convert(self, context, files_count): class UsersConverter(): header_auth = """INSERT INTO auth_user - (id, username, email, password, is_active, date_joined) - VALUES """ +(id, username, email, password, is_active, date_joined) +VALUES """ header_forum = """INSERT INTO forum_user - (user_ptr_id, last_seen, about, website, - reputation, gold, silver, bronze, real_name, location) - VALUES """ +(user_ptr_id, last_seen, about, website, +reputation, gold, silver, bronze, real_name, location) +VALUES """ usernames = set() - existing_users = {} # {email: id} + existing_users = {} # {email: id} usermap = {} last_user_id = 0 @@ -268,7 +269,7 @@ def __init__(self): def make_sql_forum(self, obj): return u"(%d, '%s', '%s', '%s', %s, 0, 0, 0, '%s', '%s')" % ( - self.current_id, readTime(obj.get('LastAccessDate')), + self.current_id, readTime(obj.get('LastAccessDate')), escape(obj.get('AboutMe', '')), escape(obj.get('WebSiteUrl', '')), obj['Reputation'], escape(obj.get('RealName', '')[:30]), @@ -278,7 +279,11 @@ def make_sql_forum(self, obj): def make_sql_auth(self, obj): name = escape(obj['DisplayName'].strip()) if name in self.usernames: - name = name + obj['Id'] + # name = name + obj['Id'] + suffix = 1 + while "%s%d" %(name,suffix) in self.usernames: + suffix +=1 + name = "%s%d" %(name,suffix) self.usernames.add(name) return u"(%d, '%s', '%s', '!', 1, '%s')" % ( self.current_id, name, obj['EmailHash'], @@ -341,11 +346,11 @@ class VotesConverter(): action_id = 1 action_header = """INSERT INTO forum_action (id, user_id, node_id, - action_type, action_date) VALUES """ +action_type, action_date) VALUES """ #actionrepute_header = """INSERT INTO forum_actionrepute (action_id, - # date, user_id, value) VALUES """ + # date, user_id, value) VALUES """ vote_header = """INSERT IGNORE INTO forum_vote (user_id, node_id, value, - action_id, voted_at) VALUES """ +action_id, voted_at) VALUES """ def __init__(self, usermap, postsmap): self.usermap = usermap @@ -370,7 +375,7 @@ def make_sql(self, obj): self.action_id, creation_date ) sql_action = u"(%d, %d, %d, '%s', '%s')" % ( - self.action_id, user_id, post_id, + self.action_id+1, user_id, post_id, self.get_action(obj['VoteTypeId']), creation_date, ) @@ -407,30 +412,105 @@ def convert(self, context, files_count): writer_action.close() + + +class CommentsConverter(): + + header = """INSERT INTO forum_node +(id, author_id, body, node_type, parent_id, +added_at, score) VALUES """.encode("utf-8") + def __init__(self, usermap, postsmap): + self.usermap = usermap + self.postsmap = postsmap + con = MySQLdb.connect('localhost', OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME) + with con: + self.node_id = get_last_id(con.cursor(), "forum_node") + + def make_sql(self, obj): + if int(obj['PostId']) not in self.postsmap: + return None + user_id = self.usermap.get(int(obj['UserId']), 1) if 'UserId' in obj else 1 + post_id = self.postsmap[int(obj['PostId'])] + return u"(%d, %d, '%s', 'comment', %s, '%s', %s)" % ( + self.node_id+1, + user_id, + escape(obj.get('Text', '')), + post_id, + readTime(obj.get('CreationDate')), + escape(obj.get('Score', 1))) + + + def convert(self, context, files_count): + files_count = (files_count // 2) or 1 + writer_comment = Writer(files_count, "comment") + counter = 0 + mod = MAX_VALUES * files_count + for event, elem in context: + values_comment = self.make_sql(elem.attrib) + if values_comment == None: + continue + #print values_comment + #values_comment = 'test tool' + if counter % mod == 0: + writer_comment.write(u';\n' + self.header, True) + + if counter % mod < files_count: + writer_comment.write(values_comment.encode('utf-8')) + else: + writer_comment.write((u',\n' + values_comment).encode('utf-8')) + + counter += 1 + self.node_id += 1 + elem.clear() + while elem.getprevious() is not None: + del elem.getparent()[0] + writer_comment.close() + + + + + + + + + def convert(path, files_count): print "Processing 'users.xml'" usersConverter = UsersConverter() context = etree.iterparse(os.path.join(path, FILES[0]), events=('end',), tag='row') usermap = usersConverter.convert(context, files_count) + os.system('mysql -u %s -p%s %s< auth_user-0.sql' % (OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME) ) + os.system('mysql -u %s -p%s %s< forum_user-0.sql' % (OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME) ) del context - + print "Processing 'posts.xml'" postsConverter = PostsConverter(usermap) context = etree.iterparse(os.path.join(path, FILES[1]), events=('end',), tag='row') postsmap = postsConverter.convert(context, files_count) + os.system('mysql -u %s -p%s %s< posts-0.sql' % (OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME) ) + os.system('mysql -u %s -p%s %s< posts-misc.sql' % (OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME) ) del context print "Processing 'votes.xml'" votesConverter = VotesConverter(usermap, postsmap) context = etree.iterparse(os.path.join(path, FILES[2]), events=('end',), tag='row') votesConverter.convert(context, files_count) + os.system('mysql -u %s -p%s %s< forum_action-0.sql' % (OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME) ) + os.system('mysql -u %s -p%s %s< forum_vote-0.sql' % (OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME) ) del context + print "Processing 'comments.xml'" + commentsConverter = CommentsConverter(usermap, postsmap) + context = etree.iterparse(os.path.join(path, FILES[3]), events=('end',), tag='row') + commentsConverter.convert(context, files_count) + os.system('mysql -u %s -p%s %s< comment-0.sql' % (OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME) ) + del context if __name__ == "__main__": if len(sys.argv) < 2: print "Please provide path to the directory with:" print FILES else: - files_count = int(sys.argv[2]) if len(sys.argv) > 2 else 6 + #files_count = int(sys.argv[2]) if len(sys.argv) > 2 else 6 + files_count = 1 convert(sys.argv[1], files_count)