# NOTE(review): in zhihu.py this function is a method of the answer class (the
# patch adds it directly after get_voters, and test.py calls it as
# answer.get_comments()). Re-indent it one level when splicing it back in.
def get_comments(self):
    """Yield a Comment for every comment posted under this answer.

    The answer page is parsed on demand (via self.parser()) to find the
    answer's "data-aid" attribute, which is then sent to the
    AnswerCommentListV2 endpoint. Each "zm-item-comment" div in the response
    becomes one Comment.

    Nothing is yielded when the answer div cannot be located or when the
    response contains no comments. A comment that fails to parse with a
    TypeError is reported and skipped rather than ending the iteration.
    """
    if self.soup is None:
        self.parser()
    soup = self.soup

    request_url = 'http://www.zhihu.com/node/AnswerCommentListV2'
    try:
        answer_div = soup.find(
            "div",
            {"class": lambda x: x and "zm-item-answer" in x.split()})
        # find() returns None when the page has no answer div; subscripting
        # None raises the TypeError handled below.
        data_aid = answer_div["data-aid"]
        response = requests.get(
            request_url,
            params={"params": '{"answer_id":"%d"}' % int(data_aid)})
        comment_nodes = BeautifulSoup(response.content, "lxml").findAll(
            "div", {"class": "zm-item-comment"})
    except TypeError:
        print('type error in get comments')
        return

    # The yield is kept outside the try block so that only comment
    # construction is guarded, one comment at a time.
    for node in comment_nodes:
        try:
            comment = Comment(node["data-id"], node)
        except TypeError:
            print('type error in get comments')
            continue
        yield comment


class Comment:
    """One comment under an answer, parsed from its "zm-item-comment" node.

    Attributes:
        comment_id: value of the node's "data-id" attribute.
        soup: the parsed HTML node the comment was built from.
        author: User who wrote the comment, or None if it could not be found.
        content: comment text with its text fragments joined by single spaces.
        question_author_flag: True if the commenter is marked as the asker.
        answer_author_flag: True if the commenter is marked as the answer
            author.
    """
    comment_id = None
    soup = None

    def __init__(self, comment_id, soup):
        """Store the id and node, set safe defaults, then parse the node."""
        self.comment_id = comment_id
        self.soup = soup
        # Defaults are set before parsing so every getter works even when
        # the markup lacks the element that would populate the field.
        self.author = None
        self.content = u""
        self.question_author_flag = False
        self.answer_author_flag = False
        self.parser()

    def setFlag(self, input):
        """Raise the role flags that appear in the label text *input*.

        *input* may be None or empty (a tag's .string is None when the tag
        has several children); in that case no flag is changed.
        """
        if not input:
            return
        if u"提问者" in input:
            self.question_author_flag = True
        if u"作者" in input:
            self.answer_author_flag = True

    def parser(self):
        """Extract author, role flags and text content from self.soup."""
        soup = self.soup

        header = soup.find("div", {"class": "zm-comment-hd"})
        if header is not None:
            self._parse_header(header)

        body = soup.find("div", {"class": "zm-comment-content"})
        if body is not None:
            self.content = " ".join(body.stripped_strings)

    def _parse_header(self, header):
        """Fill in author and role flags from the "zm-comment-hd" div."""
        children = header.contents
        first = children[0] if children else None

        # Only text nodes can be stripped and compared; a leading tag means
        # the comment is not anonymous.
        if isinstance(first, type(u"")) and first.strip() == u"匿名用户":
            self.author = User(None, u"匿名用户")
            if len(children) > 1:
                self.setFlag(children[1].string)
            return

        link = header.find("a", {"class": "zg-link"})
        if link is not None:
            self.author = User(link['href'], link.string)
            # The role label, when present, is the node right after the link.
            self.setFlag(getattr(link.next_sibling, "string", None))

    def get_author(self):
        """Return the comment's User, or None if no author was parsed."""
        return self.author

    def get_content(self):
        """Return the comment text, GBK-encoded when running on Windows.

        On other platforms the text is returned unchanged.
        """
        text = self.content
        if platform.system() == 'Windows':
            # Decode only real byte strings. Calling .decode() on unicode
            # text makes Python 2 encode it as ASCII first, which fails for
            # Chinese characters.
            if isinstance(text, bytes):
                text = text.decode('utf-8')
            return text.encode('gbk')
        return text

    # Whether the commenter is the question asker.
    def get_question_author_flag(self):
        """Return True if the commenter is marked as the question asker."""
        return self.question_author_flag

    # Whether the commenter is the answer author.
    def get_answer_author_flag(self):
        """Return True if the commenter is marked as the answer author."""
        return self.answer_author_flag