Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 110 additions & 30 deletions qa-engine/convert_to_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,16 @@
import sqlite3
from django.utils.encoding import smart_str, smart_unicode
import gc
import os

# Timestamp captured once, when the module is loaded.
now = datetime.now()

# Input file names, looked up by position (FILES[0] .. FILES[3]) in convert().
FILES = ('users.xml', 'posts.xml', 'votes.xml', 'comments.xml')

# Multiplied by the number of output files to decide how many rows are
# written between two INSERT headers.
MAX_VALUES = 250

# Group 1 is the text preceding a "." followed by digits.
# NOTE(review): presumably used to drop fractional seconds from timestamps;
# confirm against readTime().
msstrip = re.compile(r'^(.*)\.\d+')

# Credentials and schema name used both for MySQLdb.connect() and for the
# `mysql` command line invoked from convert().
OSQA_DB_USERNAME = 'osqa'
OSQA_DB_PASSWORD = 'osqapass'
OSQA_DB_NAME = 'osqa'


def escape(string):
Expand All @@ -41,10 +42,10 @@ def readTime(ts):
except ValueError:
return datetime(*time.strptime(ts, '%Y-%m-%d')[0:3])


def getFilePath(name):
    """Return *name* joined onto the directory that contains this script.

    The script location is resolved with ``os.path.realpath`` so symlinks
    to the script are followed before the directory is taken.
    """
    script_location = os.path.realpath(__file__)
    script_dir = os.path.dirname(script_location)
    return os.path.join(script_dir, name)
# Not needed in batch_process
#def getFilePath(name):
# root_dir = os.path.dirname(os.path.realpath(__file__))
# return os.path.join(root_dir, name)


def writew(f, header, itervalues, func, check=lambda x: True, on_duplicate=None):
Expand All @@ -68,7 +69,7 @@ class Writer():

def __init__(self, count, name):
self.count = count
self.FILES = [open(getFilePath("%s-%s.sql" % (name, i)), "w") for i in xrange(count)]
self.FILES = [open(("%s-%s.sql" % (name, i)), "w") for i in xrange(count)]

def write(self, string, is_header=False):
if is_header:
Expand All @@ -94,14 +95,14 @@ class PostsConverter():
revision_id = 0
parent_is_not_available = {}
header = """INSERT INTO forum_node
(id, title, tagnames, author_id, body, node_type, parent_id,
added_at, score, state_string, last_activity_by_id, last_activity_at,
active_revision_id, extra_count, marked) VALUES """.encode("utf-8")
(id, title, tagnames, author_id, body, node_type, parent_id,
added_at, score, state_string, last_activity_by_id, last_activity_at,
active_revision_id, extra_count, marked) VALUES """.encode("utf-8")

def __init__(self, usermap):
self.usermap = usermap

self.con = sqlite3.connect("/tmp/sqlite.db")
self.con = sqlite3.connect("/home/syan3/test_convert/result/batchimport/foo4.db")
cur = self.con.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS tags " +
"(id INT, name TEXT UNIQUE, used_count INT, updated INT, PRIMARY KEY(id ASC))")
Expand Down Expand Up @@ -154,7 +155,7 @@ def create_and_activate_revision(self, title, owner_id, creation_date, body):
cur = self.con.cursor()
self.revision_id += 1
values = u"(%d, '%s','',%d,'%s',%s,'Initial revision',1,'%s')" % (
self.revision_id, title, owner_id, '', # writing empty body
self.revision_id, title, owner_id, '', # writing empty body
self.current_post_id, readTime(creation_date) )
cur.execute("INSERT INTO revisions VALUES(?)", (values,) )
return self.revision_id
Expand Down Expand Up @@ -185,8 +186,8 @@ def make_sql(self, obj):

def finalize(self):
tags_header = """INSERT INTO forum_tag
(id, name, created_by_id, created_at, used_count) VALUES """
f = open(getFilePath("posts-misc.sql"), "w")
(id, name, created_by_id, created_at, used_count) VALUES """
f = open(("posts-misc.sql"), "w")
cur = self.con.cursor()

writew(f, tags_header, cur.execute("SELECT id, name, used_count FROM tags WHERE updated=1"),
Expand All @@ -196,8 +197,8 @@ def finalize(self):
nodetags_header = u"INSERT INTO forum_node_tags(node_id,tag_id) VALUES "
writew(f, nodetags_header, self.nodetags, lambda x: u"(%s,%s)" % x)
revisions_header = """INSERT INTO forum_noderevision
(id, title, tagnames, author_id, body, node_id,
summary, revision, revised_at) VALUES """
(id, title, tagnames, author_id, body, node_id,
summary, revision, revised_at) VALUES """

writew(f, revisions_header, cur.execute("SELECT data FROM revisions"), lambda x: x[0])

Expand Down Expand Up @@ -240,14 +241,14 @@ def convert(self, context, files_count):

class UsersConverter():
header_auth = """INSERT INTO auth_user
(id, username, email, password, is_active, date_joined)
VALUES """
(id, username, email, password, is_active, date_joined)
VALUES """
header_forum = """INSERT INTO forum_user
(user_ptr_id, last_seen, about, website,
reputation, gold, silver, bronze, real_name, location)
VALUES """
(user_ptr_id, last_seen, about, website,
reputation, gold, silver, bronze, real_name, location)
VALUES """
usernames = set()
existing_users = {} # {email: id}
existing_users = {} # {email: id}
usermap = {}
last_user_id = 0

Expand All @@ -268,7 +269,7 @@ def __init__(self):

def make_sql_forum(self, obj):
return u"(%d, '%s', '%s', '%s', %s, 0, 0, 0, '%s', '%s')" % (
self.current_id, readTime(obj.get('LastAccessDate')),
self.current_id, readTime(obj.get('LastAccessDate')),
escape(obj.get('AboutMe', '')),
escape(obj.get('WebSiteUrl', '')),
obj['Reputation'], escape(obj.get('RealName', '')[:30]),
Expand All @@ -278,7 +279,11 @@ def make_sql_forum(self, obj):
def make_sql_auth(self, obj):
name = escape(obj['DisplayName'].strip())
if name in self.usernames:
name = name + obj['Id']
# name = name + obj['Id']
suffix = 1
while "%s%d" %(name,suffix) in self.usernames:
suffix +=1
name = "%s%d" %(name,suffix)
self.usernames.add(name)
return u"(%d, '%s', '%s', '!', 1, '%s')" % (
self.current_id, name, obj['EmailHash'],
Expand Down Expand Up @@ -341,11 +346,11 @@ class VotesConverter():
action_id = 1

action_header = """INSERT INTO forum_action (id, user_id, node_id,
action_type, action_date) VALUES """
action_type, action_date) VALUES """
#actionrepute_header = """INSERT INTO forum_actionrepute (action_id,
# date, user_id, value) VALUES """
# date, user_id, value) VALUES """
vote_header = """INSERT IGNORE INTO forum_vote (user_id, node_id, value,
action_id, voted_at) VALUES """
action_id, voted_at) VALUES """

def __init__(self, usermap, postsmap):
self.usermap = usermap
Expand All @@ -370,7 +375,7 @@ def make_sql(self, obj):
self.action_id, creation_date
)
sql_action = u"(%d, %d, %d, '%s', '%s')" % (
self.action_id, user_id, post_id,
self.action_id+1, user_id, post_id,
self.get_action(obj['VoteTypeId']), creation_date,
)

Expand Down Expand Up @@ -407,30 +412,105 @@ def convert(self, context, files_count):
writer_action.close()




class CommentsConverter():
    """Turn rows of the comments XML dump into ``forum_node`` INSERT files.

    Each accepted row becomes one ``'comment'`` node whose parent is the
    already-imported post it refers to.  Rows that reference a post missing
    from ``postsmap`` are dropped.
    """

    # Same text as the original triple-quoted literal (line breaks included).
    header = (
        "INSERT INTO forum_node\n"
        "(id, author_id, body, node_type, parent_id,\n"
        "added_at, score) VALUES "
    ).encode("utf-8")

    def __init__(self, usermap, postsmap):
        """Store the id maps and read the current last ``forum_node`` id.

        usermap  -- mapping looked up with the integer ``UserId`` of a row.
        postsmap -- mapping looked up with the integer ``PostId`` of a row.
        """
        self.usermap = usermap
        self.postsmap = postsmap
        con = MySQLdb.connect('localhost', OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME)
        try:
            self.node_id = get_last_id(con.cursor(), "forum_node")
        finally:
            # Close explicitly: what `with con:` does on exit differs between
            # MySQLdb releases, and this connection is only needed once.
            con.close()

    def make_sql(self, obj):
        """Return the VALUES tuple for one comment row, or None to skip it.

        obj -- attribute mapping of a <row> element.

        Returns None when the row's ``PostId`` is not in ``postsmap``.
        Rows without a ``UserId``, or with one missing from ``usermap``,
        are attributed to user id 1.
        """
        post_key = int(obj['PostId'])
        if post_key not in self.postsmap:
            return None

        if 'UserId' in obj:
            user_id = self.usermap.get(int(obj['UserId']), 1)
        else:
            user_id = 1
        post_id = self.postsmap[post_key]

        return u"(%d, %d, '%s', 'comment', %s, '%s', %s)" % (
            self.node_id + 1,
            user_id,
            escape(obj.get('Text', '')),
            post_id,
            readTime(obj.get('CreationDate')),
            # Text default (renders the same "1") so escape() is always
            # handed a string, like every other call in this method.
            escape(obj.get('Score', '1')))

    def convert(self, context, files_count):
        """Write every accepted comment row from *context* to SQL files.

        context     -- iterable of (event, element) pairs, e.g. from
                       etree.iterparse.
        files_count -- requested number of output files; half of it
                       (at least 1) is actually used.

        A header is written every ``MAX_VALUES * files_count`` accepted rows.
        The first ``files_count`` rows after a header are written without a
        leading comma; all later rows are prefixed with ",\\n".
        """
        files_count = (files_count // 2) or 1
        writer_comment = Writer(files_count, "comment")
        counter = 0
        mod = MAX_VALUES * files_count
        try:
            for event, elem in context:
                values_comment = self.make_sql(elem.attrib)
                if values_comment is not None:
                    if counter % mod == 0:
                        writer_comment.write(u';\n' + self.header, True)

                    if counter % mod < files_count:
                        writer_comment.write(values_comment.encode('utf-8'))
                    else:
                        writer_comment.write((u',\n' + values_comment).encode('utf-8'))

                    counter += 1
                    self.node_id += 1

                # Release the element and its already-processed siblings for
                # every row, skipped ones included, so memory stays flat.
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]
        finally:
            # Make sure the output files are closed even if a row fails.
            writer_comment.close()









def convert(path, files_count):
    """Convert the XML dump found in *path* and load it into MySQL.

    The four input files named in FILES are processed in order (users,
    posts, votes, comments).  After each converter has run, the SQL files
    with index 0 that it produced are piped into the `mysql` client using
    the OSQA_DB_* settings.

    path        -- directory containing the XML files.
    files_count -- passed through to each converter's convert().

    Only the "*-0.sql" outputs are loaded here, so any file with a higher
    index written by a converter is not imported by this function.
    """

    def load_sql(sql_file):
        # Feed one generated SQL file to the mysql command-line client.
        os.system('mysql -u %s -p%s %s< %s' % (
            OSQA_DB_USERNAME, OSQA_DB_PASSWORD, OSQA_DB_NAME, sql_file))

    def open_rows(index):
        # Incremental parser that yields each finished <row> of FILES[index].
        xml_path = os.path.join(path, FILES[index])
        return etree.iterparse(xml_path, events=('end',), tag='row')

    print("Processing 'users.xml'")
    users_converter = UsersConverter()
    context = open_rows(0)
    usermap = users_converter.convert(context, files_count)
    load_sql('auth_user-0.sql')
    load_sql('forum_user-0.sql')
    del context

    print("Processing 'posts.xml'")
    posts_converter = PostsConverter(usermap)
    context = open_rows(1)
    postsmap = posts_converter.convert(context, files_count)
    load_sql('posts-0.sql')
    load_sql('posts-misc.sql')
    del context

    print("Processing 'votes.xml'")
    votes_converter = VotesConverter(usermap, postsmap)
    context = open_rows(2)
    votes_converter.convert(context, files_count)
    load_sql('forum_action-0.sql')
    load_sql('forum_vote-0.sql')
    del context

    print("Processing 'comments.xml'")
    comments_converter = CommentsConverter(usermap, postsmap)
    context = open_rows(3)
    comments_converter.convert(context, files_count)
    load_sql('comment-0.sql')
    del context

# Command-line entry point: argv[1] is the directory holding the XML dump.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Please provide path to the directory with:")
        print(FILES)
    else:
        # Pinned to a single output file per table: convert() only loads
        # the "*-0.sql" files, so a larger count would leave data unimported.
        # Any further command-line argument is therefore ignored.
        files_count = 1
        convert(sys.argv[1], files_count)