From 9a250df366b53132c4cffb2d7312e5c62ca12761 Mon Sep 17 00:00:00 2001 From: thekingofcity <1634814279@qq.com> Date: Sat, 26 May 2018 19:04:40 +0800 Subject: [PATCH 1/2] samefollow now store in UserRelation --- db/models.py | 6 +++++- db/tables.py | 5 ++--- page_get/user.py | 5 +++-- page_parse/user/person.py | 44 +++++++++++++++++++++++++++++++++------ page_parse/user/public.py | 1 + 5 files changed, 49 insertions(+), 12 deletions(-) diff --git a/db/models.py b/db/models.py index 3960e00c..8f74c400 100644 --- a/db/models.py +++ b/db/models.py @@ -58,11 +58,15 @@ def __repr__(self): class UserRelation(Base): __table__ = user_relation - def __init__(self, uid, other_id, type, from_where): + def __init__(self, uid, other_id, type, from_where, crawl_time=True): self.user_id = uid self.follow_or_fans_id = other_id self.type = type self.from_where = from_where + if crawl_time: + self.crawl_time = func.now() + else: + self.crawl_time = None def __repr__(self): return 'user_id:{},follow_or_fans_id:{},type:{},from_where:{}'.format(self.user_id, self.follow_or_fans_id, self.type, self.from_where) diff --git a/db/tables.py b/db/tables.py index 95204705..a9d22829 100644 --- a/db/tables.py +++ b/db/tables.py @@ -32,7 +32,6 @@ Column("contact_info", String(300), default='', server_default=''), Column("education_info", String(300), default='', server_default=''), Column("head_img", String(500), default='', server_default=''), - Column("isFan", INTEGER, default=0, server_default='0'), ) # seed ids for user crawling @@ -120,8 +119,8 @@ Column('follow_or_fans_id', String(20)), Column('type', INTEGER), # 1 stands for fans, 2 stands for follows Column('from_where', String(60)), - Column('crawl_time', DateTime(3), default=func.now()) # DATETIME(6) means save 6 digits milliseconds - # time is stored in UTC + Column('crawl_time', DateTime(3)) # DATETIME(6) means save 6 digits milliseconds + # time is stored in UTC ) # dialogue table diff --git a/page_get/user.py b/page_get/user.py index 27a30b84..26c9dd26 100755 --- a/page_get/user.py +++ b/page_get/user.py @@ -63,10 +63,11 @@ def get_url_from_web(user_id): elif domain == '100505': user = get_user_detail(user_id, html) samefollow_uid = get_samefollow_uid() - if samefollow_uid: + if samefollow_uid.strip() != '': + samefollow_uid = samefollow_uid.split(',') url = SAMEFOLLOW_URL.format(user_id) isFanHtml = get_page(url, auth_level=2) - user.isFan = person.get_isFan(isFanHtml, samefollow_uid) + person.get_isFan(isFanHtml, samefollow_uid, user_id) # enterprise or service else: user = get_enterprise_detail(user_id, html) diff --git a/page_parse/user/person.py b/page_parse/user/person.py index 4abb012d..b03f5bda 100755 --- a/page_parse/user/person.py +++ b/page_parse/user/person.py @@ -1,10 +1,12 @@ import re +import json from bs4 import BeautifulSoup from ..user import public from decorators import parse_decorator -from db.models import User +from db.models import (User, UserRelation) +from db.dao import UserRelationOper @parse_decorator(0) @@ -135,18 +137,48 @@ def get_detail(html, uid): @parse_decorator(None) -def get_isFan(html, uid): +def get_isFan(html, uids, current_uid): """ :param html: samefollow page - :param uid : whether this account follows uid + :param uids: list contains uids to determine this account follows or not + :param current_uid: current crawling user :return: 1 for yes 0 for no """ soup = BeautifulSoup(html, "html.parser") scripts = soup.find_all('script') pattern = re.compile(r'FM.view\((.*)\)') + user_ids = list() # Contains uids that the user and crawler both follow + intersection_ids = list() # Contains the intersection of param uids and user_ids + relations = list() # Contains list to be stored in UserRelation table for script in scripts: m = pattern.search(script.string) - if m and uid in script.string: - return 1 - return 0 + # Find the