diff --git a/apps/common/src/python/mediawords/util/mail.py b/apps/common/src/python/mediawords/util/mail.py
index 3e75702f6d..939676bbf2 100644
--- a/apps/common/src/python/mediawords/util/mail.py
+++ b/apps/common/src/python/mediawords/util/mail.py
@@ -13,6 +13,8 @@
 # Environment variable that, when set, will prevent the package from actually sending the email
 __ENV_MAIL_DO_NO_SEND = 'MEDIACLOUD_MAIL_DO_NOT_SEND'
 
+# queue a list of test messages sent for validation
+_sent_test_messages = []
 
 class McSendEmailException(Exception):
     """send_email() exception."""
@@ -27,6 +29,10 @@ def disable_test_mode():
     del os.environ[__ENV_MAIL_DO_NO_SEND]
 
 
+def sent_test_messages():
+    return _sent_test_messages
+
+
 def test_mode_is_enabled() -> bool:
     return __ENV_MAIL_DO_NO_SEND in os.environ
 
@@ -123,6 +129,7 @@ def send_email(message: Message) -> bool:
         mime_message.attach(message_part)
 
     if test_mode_is_enabled():
+        _sent_test_messages.append(message)
         log.info("Test mode is enabled, not actually sending any email.")
         log.debug("Omitted email:\n\n%s" % mime_message.as_string())
 
diff --git a/apps/common/src/python/mediawords/util/url/__init__.py b/apps/common/src/python/mediawords/util/url/__init__.py
index 531bc26508..1f82207c5c 100644
--- a/apps/common/src/python/mediawords/util/url/__init__.py
+++ b/apps/common/src/python/mediawords/util/url/__init__.py
@@ -178,7 +178,7 @@ def normalize_url(url: str) -> str:
     url = fix_common_url_mistakes(url)
 
     try:
-        url = canonical_url(url)
+        url = canonical_url(url)
     except Exception as ex:
         raise McNormalizeURLException("Unable to get canonical URL: %s" % str(ex))
 
diff --git a/apps/common/tests/python/mediawords/util/test_mail.py b/apps/common/tests/python/mediawords/util/test_mail.py
index 11415e26c3..cbc2f79843 100644
--- a/apps/common/tests/python/mediawords/util/test_mail.py
+++ b/apps/common/tests/python/mediawords/util/test_mail.py
@@ -4,6 +4,7 @@
     Message,
     send_email,
     send_text_email,
+    sent_test_messages,
     enable_test_mode as enable_mail_test_mode,
     disable_test_mode as disable_mail_test_mode,
 )
@@ -29,6 +30,10 @@ def test_send_mail(self):
         )
         assert send_email(message)
 
+        sent_message = sent_test_messages().pop()
+
+        assert sent_message == message
+
     def test_send_text_email(self):
         assert send_text_email(
             to='nowhere@mediacloud.org',
diff --git a/apps/tools/bin/dev/jumpstart_perl_to_python.pl b/apps/tools/bin/dev/jumpstart_perl_to_python.pl
index c59879d842..4dcce4317e 100755
--- a/apps/tools/bin/dev/jumpstart_perl_to_python.pl
+++ b/apps/tools/bin/dev/jumpstart_perl_to_python.pl
@@ -119,6 +119,12 @@ sub main
     # eq -> ==
     $code =~ s/ eq / == /g;
 
+    # undef to None
+    $code =~ s/undef/None/g;
+
+    # add parens to common db methods
+    $code =~ s/(hash(es)?|flat)$/$1()/;
+
     print $code;
 }
 
diff --git a/apps/topics-base/src/python/topics_base/alert.py b/apps/topics-base/src/python/topics_base/alert.py
new file mode 100644
index 0000000000..6472429676
--- /dev/null
+++ b/apps/topics-base/src/python/topics_base/alert.py
@@ -0,0 +1,33 @@
+from mediawords.util.log import create_logger
+log = create_logger(__name__)
+
+import mediawords.util.mail
+import topics_base.config
+import topics_base.messages
+
+def send_topic_alert(db, topic, message):
+    """ send an alert about significant activity on the topic to all users with at least write access to the topic"""
+
+    emails = db.query(
+        """
+        select distinct au.email
+            from auth_users au
+            join topic_permissions tp using (auth_users_id)
+            where
+                tp.permission in ('admin', 'write') and
+                tp.topics_id = %(a)s
+        """,
+        {'a': topic['topics_id']}).flat()
+
+    emails.extend(topics_base.config.TopicsBaseConfig.topic_alert_emails())
+
+    emails = set(emails)
+
+    for email in emails:
+        alert_message = topics_base.messages.TopicSpiderUpdateMessage(
+            to=email,
+            topic_name=topic['name'],
+            topic_url=f"https://topics.mediacloud.org/#/topics/{topic['topics_id']}/summary",
+            topic_spider_status=message,
+        )
+        mediawords.util.mail.send_email(alert_message)
diff --git a/apps/topics-base/tests/python/test_alert.py b/apps/topics-base/tests/python/test_alert.py
new file mode 100644
index 0000000000..e075dd7d2b
--- /dev/null
+++ b/apps/topics-base/tests/python/test_alert.py
@@ -0,0 +1,53 @@
+import hashlib
+
+from mediawords.db import connect_to_db
+import mediawords.test.db.create
+import mediawords.util.mail
+import topics_base.alert
+from topics_base.config import TopicsBaseConfig
+
+from mediawords.util.log import create_logger
+
+log = create_logger(__name__)
+
+def _create_permission(db, topic, permission):
+    au = {
+        'email': f'{permission}@bar.com',
+        'password_hash': 'x' * 137,
+        'full_name': 'foo bar'}
+    au = db.create('auth_users', au)
+
+    tp = {
+        'topics_id': topic['topics_id'],
+        'auth_users_id': au['auth_users_id'],
+        'permission': permission}
+    tp = db.create('topic_permissions', tp)
+
+    return au
+
+
+def test_topic_alert():
+    db = connect_to_db()
+
+    topic = mediawords.test.db.create.create_test_topic(db, 'test')
+
+    au_admin = _create_permission(db, topic, 'admin')
+    au_read = _create_permission(db, topic, 'read')
+    au_write = _create_permission(db, topic, 'write')
+
+    mediawords.util.mail.enable_test_mode()
+
+    test_message = 'foobarbat'
+
+    topics_base.alert.send_topic_alert(db, topic, test_message)
+
+    sent_mails = mediawords.util.mail.sent_test_messages()
+
+    expected_emails = [au['email'] for au in (au_admin, au_write)] + TopicsBaseConfig.topic_alert_emails()
+    got_emails = [m.to[0] for m in sent_mails]
+
+    assert len(sent_mails) == len(expected_emails)
+
+    assert set(got_emails) == set(expected_emails)
+
+
diff --git a/apps/topics-extract-story-links/src/python/topics_extract_story_links/extract_story_links.py b/apps/topics-extract-story-links/src/python/topics_extract_story_links/extract_story_links.py
index 949643f11f..23d154ba83 100644
--- a/apps/topics-extract-story-links/src/python/topics_extract_story_links/extract_story_links.py
+++ b/apps/topics-extract-story-links/src/python/topics_extract_story_links/extract_story_links.py
@@ -73,6 +73,9 @@ def _get_youtube_embed_links(db: DatabaseHandler, story: dict) -> List[str]:
         "select * from downloads where stories_id = %(a)s order by stories_id limit 1",
         {'a': story['stories_id']}).hash()
 
+    if not download:
+        return []
+
     html = fetch_content(db, download)
 
     soup = BeautifulSoup(html, 'lxml')
diff --git a/apps/topics-mine-public/Dockerfile b/apps/topics-mine-public/Dockerfile
index 0735388f3e..d403c3d309 100644
--- a/apps/topics-mine-public/Dockerfile
+++ b/apps/topics-mine-public/Dockerfile
@@ -9,4 +9,4 @@ COPY bin /opt/mediacloud/bin
 
 USER mediacloud
 
-CMD ["topics_mine_public_worker.pl"]
+CMD ["topics_mine_public_worker.py"]
diff --git a/apps/topics-mine-public/bin/topics_mine_public_worker.pl b/apps/topics-mine-public/bin/topics_mine_public_worker.pl
deleted file mode 100755
index 724e8cd641..0000000000
--- a/apps/topics-mine-public/bin/topics_mine_public_worker.pl
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env perl
-#
-# This job is a copy of MineTopic but is used to run a separate job queue for topics requested by public users.
-# - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use MediaWords::TM::Worker; - - -sub main() -{ - MediaWords::TM::Worker::start_topics_mine_worker( 'MediaWords::Job::TM::MineTopicPublic' ); -} - -main(); diff --git a/apps/topics-mine-public/bin/topics_mine_public_worker.py b/apps/topics-mine-public/bin/topics_mine_public_worker.py new file mode 100755 index 0000000000..ac8b58b8c0 --- /dev/null +++ b/apps/topics-mine-public/bin/topics_mine_public_worker.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +from mediawords.job import JobBroker +from mediawords.util.log import create_logger +from topics_mine.mine import run_worker_job + +log = create_logger(__name__) + +QUEUE_NAME = 'MediaWords::Job::TM::MineTopicPublic' + +if __name__ == '__main__': + app = JobBroker(queue_name=QUEUE_NAME) + app.start_worker(handler=run_worker_job) diff --git a/apps/topics-mine/Dockerfile b/apps/topics-mine/Dockerfile index 16296056cc..4df95fedaf 100644 --- a/apps/topics-mine/Dockerfile +++ b/apps/topics-mine/Dockerfile @@ -4,23 +4,22 @@ FROM gcr.io/mcback/topics-base:latest -# Install Perl dependencies -COPY src/cpanfile /var/tmp/ +# Install Python dependencies +COPY src/requirements.txt /var/tmp/ RUN \ cd /var/tmp/ && \ - cpm install --global --resolver 02packages --no-prebuilt --mirror "$MC_PERL_CPAN_MIRROR" && \ - rm cpanfile && \ - rm -rf /root/.perl-cpm/ && \ + pip3 install -r requirements.txt && \ + rm requirements.txt && \ + rm -rf /root/.cache/ && \ true # Copy sources COPY src/ /opt/mediacloud/src/topics-mine/ -ENV PERL5LIB="/opt/mediacloud/src/topics-mine/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/topics-mine/python:${PYTHONPATH}" +ENV PYTHONPATH="/opt/mediacloud/src/topics-mine/python:${PYTHONPATH}" # Copy worker script COPY bin /opt/mediacloud/bin USER mediacloud -CMD ["topics_mine_worker.pl"] +CMD ["topics_mine_worker.py"] diff --git a/apps/topics-mine/bin/mine_topic.pl b/apps/topics-mine/bin/mine_topic.pl deleted file mode 100755 index 14c275fe35..0000000000 --- a/apps/topics-mine/bin/mine_topic.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Getopt::Long; - -use MediaWords::DB; -use MediaWords::TM::CLI; -use MediaWords::TM::Mine; - -sub main -{ - my ( $topic_opt, $import_only, $skip_post_processing, $snapshots_id, $resume_snapshot ); - - binmode( STDOUT, 'utf8' ); - binmode( STDERR, 'utf8' ); - - $| = 1; - - Getopt::Long::GetOptions( - "topic=s" => \$topic_opt, - "import_only!" => \$import_only, - "resume_snapshot!" => \$resume_snapshot, - "skip_post_processing!" => \$skip_post_processing, - "snapshots_id=i" => \$snapshots_id - ) || return; - - my $args_list = [ qw(import_only skip_post_processing snapshots_id resume_snapshot) ]; - my $optional_args = join( ' ', map { "[ --$_ ]" } @{ $args_list } ); - die( "usage: $0 --topic < id > $optional_args" ) unless ( $topic_opt ); - - my $db = MediaWords::DB::connect_to_db(); - my $topics = MediaWords::TM::CLI::require_topics_by_opt( $db, $topic_opt ); - unless ( $topics ) - { - die "Unable to find topics for option '$topic_opt'"; - } - - for my $topic ( @{ $topics } ) - { - my $topics_id = $topic->{ topics_id }; - INFO "Processing topic $topics_id..."; - - if ( $resume_snapshot ) - { - ( $snapshots_id ) = $db->query( <flat(); -select * from snapshots where topics_id = ? 
-    order by snapshots_id desc limit 1
-SQL
-            die( "no snapshot found for topic $topic->{ topics_id }" ) unless ( $snapshots_id );
-        }
-
-        my $args = {
-            topics_id => $topics_id,
-            import_only => $import_only,
-            skip_post_processing => $skip_post_processing,
-            snapshots_id => $snapshots_id,
-        };
-
-        MediaWords::TM::Mine::mine_topic( $db, $topic, $args );
-
-        INFO "Done processing topic $topics_id.";
-    }
-}
-
-main();
diff --git a/apps/topics-mine/bin/mine_topic.py b/apps/topics-mine/bin/mine_topic.py
new file mode 100755
index 0000000000..de0084c1ed
--- /dev/null
+++ b/apps/topics-mine/bin/mine_topic.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+import argparse
+
+from mediawords.db import connect_to_db
+from topics_mine.mine import mine_topic
+
+def main():
+    """run mine_topic with cli args."""
+    parser = argparse.ArgumentParser(description="Run topics_mine job.")
+    parser.add_argument("-t", "--topics_id", type=int, required=True)
+    parser.add_argument("-s", "--snapshots_id", type=int, required=False)
+    parser.add_argument("-r", "--resume_snapshot", action='store_true')
+    parser.add_argument("-i", "--import_only", action='store_true')
+    parser.add_argument("-p", "--skip_post_processing", action='store_true')
+    args = parser.parse_args()
+
+    db = connect_to_db()
+
+    snapshots_id = args.snapshots_id
+    if args.resume_snapshot:
+        snapshots_id = db.query(
+            "select snapshots_id from snapshots where topics_id = %(a)s order by snapshots_id desc limit 1",
+            {'a': args.topics_id}).flat()[0]
+
+    topic = db.require_by_id('topics', args.topics_id)
+
+    mine_topic(
+        db=db,
+        topic=topic,
+        snapshots_id=snapshots_id,
+        import_only=args.import_only,
+        skip_post_processing=args.skip_post_processing)
+
+
+main()
diff --git a/apps/topics-mine/bin/topics_mine_worker.pl b/apps/topics-mine/bin/topics_mine_worker.pl
deleted file mode 100755
index 7f4636ebe5..0000000000
--- a/apps/topics-mine/bin/topics_mine_worker.pl
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-
-use Modern::Perl "2015";
-use MediaWords::CommonLibs;
-
-use MediaWords::TM::Worker;
-
-
-sub main()
-{
-    MediaWords::TM::Worker::start_topics_mine_worker( 'MediaWords::Job::TM::MineTopic' );
-}
-
-main();
diff --git a/apps/topics-mine/bin/topics_mine_worker.py b/apps/topics-mine/bin/topics_mine_worker.py
new file mode 100755
index 0000000000..3eb0963105
--- /dev/null
+++ b/apps/topics-mine/bin/topics_mine_worker.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+
+from mediawords.job import JobBroker
+from mediawords.util.log import create_logger
+from topics_mine.mine import run_worker_job
+
+log = create_logger(__name__)
+
+QUEUE_NAME = 'MediaWords::Job::TM::MineTopic'
+
+if __name__ == '__main__':
+    app = JobBroker(queue_name=QUEUE_NAME)
+    app.start_worker(handler=run_worker_job)
diff --git a/apps/topics-mine/docker-compose.tests.yml b/apps/topics-mine/docker-compose.tests.yml
index ed2e90a371..ec6299230e 100644
--- a/apps/topics-mine/docker-compose.tests.yml
+++ b/apps/topics-mine/docker-compose.tests.yml
@@ -42,6 +42,7 @@ services:
             - postgresql-pgbouncer
             - rabbitmq-server
             - topics-fetch-link
+            - topics-fetch-twitter-urls
             - topics-extract-story-links
             # 1) test_topics_mine.t calls topics-fetch-link
             # 2) topics-fetch-link calls _try_fetch_topic_url()
             # 3) _try_fetch_topic_url() calls generate_story()
             # 4) generate_story() calls _extract_story()
             # 5) _extract_story() runs a remote extraction job
             - extract-and-vector
+            - solr-shard-01
+            - import-solr-data-for-testing
+            - facebook-fetch-story-stats
 
     extract-and-vector:
        image: 
gcr.io/mcback/extract-and-vector:latest @@ -127,6 +131,27 @@ services: source: ./../rabbitmq-server/conf/ target: /etc/rabbitmq/ + topics-fetch-twitter-urls: + image: dockermediacloud/topics-fetch-twitter-urls:latest + init: true + stop_signal: SIGKILL + volumes: + - type: bind + source: ./../topics-fetch-twitter-urls/bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./../topics-fetch-twitter-urls/src/ + target: /opt/mediacloud/src/topics-fetch-twitter-urls/ + - type: bind + source: ./../topics-base/src/ + target: /opt/mediacloud/src/topics-base/ + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ + depends_on: + - postgresql-pgbouncer + - rabbitmq-server + topics-fetch-link: image: gcr.io/mcback/topics-fetch-link:latest init: true @@ -174,3 +199,83 @@ services: - postgresql-pgbouncer # Uses extractor HTTP service directly to get raw extracted HTML: - extract-article-from-page + + import-solr-data-for-testing: + image: dockermediacloud/import-solr-data-for-testing:latest + init: true + environment: + MC_SOLR_IMPORT_MAX_QUEUED_STORIES: 100000 + stop_signal: SIGKILL + volumes: + - type: bind + source: ./../import-solr-data-for-testing/bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./../import-solr-data/src/ + target: /opt/mediacloud/src/import-solr-data/ + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ + depends_on: + - postgresql-pgbouncer + - solr-shard-01 + + solr-shard-01: + image: dockermediacloud/solr-shard:latest + init: true + stop_signal: SIGKILL + environment: + MC_SOLR_SHARD_COUNT: "1" + expose: + - 8983 + volumes: + - type: bind + source: ./../solr-base/src/solr/ + target: /usr/src/solr/ + - type: bind + source: ./../solr-shard/bin/solr-shard.sh + target: /solr-shard.sh + depends_on: + - solr-zookeeper + + solr-zookeeper: + image: dockermediacloud/solr-zookeeper:latest + init: true + stop_signal: SIGKILL + expose: + - 2181 + - 2888 + - 3888 + volumes: + - type: bind + source: ./../solr-zookeeper/conf/ + target: /opt/zookeeper/conf/ + - type: bind + source: ./../solr-zookeeper/bin/zookeeper.sh + target: /zookeeper.sh + + facebook-fetch-story-stats: + image: dockermediacloud/facebook-fetch-story-stats:latest + init: true + stop_signal: SIGKILL + environment: + MC_FACEBOOK_APP_ID: "IGNORE NOT NEEDED" + MC_FACEBOOK_APP_SECRET: "IGNORE NOT NEEEDED" + volumes: + - type: bind + source: ./../facebook-fetch-story-stats/bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./../facebook-fetch-story-stats/src/ + target: /opt/mediacloud/src/facebook-fetch-story-stats/ + - type: bind + source: ./../facebook-fetch-story-stats/tests/ + target: /opt/mediacloud/tests/ + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ + depends_on: + - postgresql-pgbouncer + - rabbitmq-server + + diff --git a/apps/topics-mine/src/cpanfile b/apps/topics-mine/src/cpanfile deleted file mode 100644 index dca604e1ea..0000000000 --- a/apps/topics-mine/src/cpanfile +++ /dev/null @@ -1,3 +0,0 @@ -requires 'Date::Format'; -requires 'Text::Lorem::More'; -requires 'Time::Piece'; diff --git a/apps/topics-mine/src/perl/MediaWords/Config/TopicsMine.pm b/apps/topics-mine/src/perl/MediaWords/Config/TopicsMine.pm deleted file mode 100644 index 2f521c07f8..0000000000 --- a/apps/topics-mine/src/perl/MediaWords/Config/TopicsMine.pm +++ /dev/null @@ -1,36 +0,0 @@ -package MediaWords::Util::Config::TopicsMine; - -use strict; -use warnings; - -use Modern::Perl "2015"; - -# Deliberately don't include 
MediaWords::CommonLibs as it includes this package itself - -{ - package MediaWords::Util::Config::TopicsMine::PythonProxy; - - use strict; - use warnings; - - use Modern::Perl "2015"; - use MediaWords::CommonLibs; - - use MediaWords::Util::Python; - - MediaWords::Util::Python::import_python_module( __PACKAGE__, 'topics_mine.config' ); - - 1; -} - -sub _python_config() -{ - return MediaWords::Util::Config::TopicsMine::PythonProxy::TopicsMineConfig->new(); -} - -sub crimson_hexagon_api_key() -{ - return _python_config()->crimson_hexagon_api_key(); -} - -1; diff --git a/apps/topics-mine/src/perl/MediaWords/TM/FetchTopicPosts.pm b/apps/topics-mine/src/perl/MediaWords/TM/FetchTopicPosts.pm deleted file mode 100644 index 7bb4f454d7..0000000000 --- a/apps/topics-mine/src/perl/MediaWords/TM/FetchTopicPosts.pm +++ /dev/null @@ -1,11 +0,0 @@ -package MediaWords::TM::FetchTopicPosts; - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -import_python_module( __PACKAGE__, 'topics_mine.fetch_topic_posts' ); - -1; diff --git a/apps/topics-mine/src/perl/MediaWords/TM/Mine.pm b/apps/topics-mine/src/perl/MediaWords/TM/Mine.pm deleted file mode 100644 index 4d0919870c..0000000000 --- a/apps/topics-mine/src/perl/MediaWords/TM/Mine.pm +++ /dev/null @@ -1,1219 +0,0 @@ -package MediaWords::TM::Mine; - -=head1 NAME - -MediaWords::TM::Mine - topic spider implementation - -=head1 SYNOPSIS - - MediaWords::TM::Mine::mine_topic( $db, $options ); - -=head1 DESCRIPTION - -The topic mining process is described in doc/topic_mining.markdown. - -=cut - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use Getopt::Long; -use List::MoreUtils; -use List::Util; -use Readonly; -use Time::Piece; - -use MediaWords::TM::Alert; -use MediaWords::TM::FetchTopicPosts; -use MediaWords::TM::Stories; -use MediaWords::DBI::Stories; -use MediaWords::DBI::Stories::GuessDate; -use MediaWords::Job::Broker; -use MediaWords::Job::StatefulBroker; -use MediaWords::Solr; -use MediaWords::Solr::Query; -use MediaWords::Util::SQL; - -# total time to wait for fetching of social media metrics -Readonly my $MAX_SOCIAL_MEDIA_FETCH_TIME => ( 60 * 60 * 24 ); - -# add new links in chunks of this size -Readonly my $ADD_NEW_LINKS_CHUNK_SIZE => 10_000; - -# extract story links in chunks of this size -Readonly my $EXTRACT_STORY_LINKS_CHUNK_SIZE => 1000; - -# query this many topic_links at a time to spider -Readonly my $SPIDER_LINKS_CHUNK_SIZE => 100_000; - -# die if the error rate for link fetch or link extract jobs is greater than this -Readonly my $MAX_JOB_ERROR_RATE => 0.02; - -# timeout when polling for jobs to finish -Readonly my $JOB_POLL_TIMEOUT => 600; - -# number of seconds to wait when polling for jobs to finish -Readonly my $JOB_POLL_WAIT => 5; - -# if more than this many seed urls are imported, dedup stories before as well as after spidering -Readonly my $MIN_SEED_IMPORT_FOR_PREDUP_STORIES => 50_000; - -# how many link extraction jobs per 1000 can we ignore if they hang -Readonly my $MAX_LINK_EXTRACTION_TIMEOUT => 10; - -# how long to wait to timeout link extraction -Readonly my $LINK_EXTRACTION_POLL_TIMEOUT => 600; - -# if mine_topic is run with the test_mode option, set this true and do not try to queue extractions -my $_test_mode; - -# update topics.state in the database -sub update_topic_state($$$) -{ - my ( $db, $state_updater, $message ) = @_; - - INFO( "update topic state: $message" ); - - unless ( $state_updater ) { - # Shouldn't happen but let's just test it here - 
ERROR "State updater is unset."; - return; - } - - eval { - $state_updater->update_job_state_message( $db, $message ); - }; - if ( $@ ) - { - die "Error updating job state: $@"; - } -} - -# return true if the publish date of the story is within 7 days of the topic date range or if the -# story is undateable -sub story_within_topic_date_range -{ - my ( $db, $topic, $story ) = @_; - - return 1 unless ( $story->{ publish_date } ); - - my $story_date = substr( $story->{ publish_date }, 0, 10 ); - - my $start_date = $topic->{ start_date }; - $start_date = MediaWords::Util::SQL::increment_day( $start_date, -7 ); - $start_date = substr( $start_date, 0, 10 ); - - my $end_date = $topic->{ end_date }; - $end_date = MediaWords::Util::SQL::increment_day( $end_date, 7 ); - $end_date = substr( $end_date, 0, 10 ); - - return 1 if ( ( $story_date ge $start_date ) && ( $story_date le $end_date ) ); - - return MediaWords::DBI::Stories::GuessDate::is_undateable( $db, $story ); -} - -# submit jobs to extract links from the given stories and then poll to wait for the stories to be processed within -# the jobs pool -sub generate_topic_links -{ - my ( $db, $topic, $stories ) = @_; - - INFO "generate topic links: " . scalar( @{ $stories } ); - - my $topic_links = []; - - if ( $topic->{ platform } ne 'web' ) - { - INFO( "skip link generation for non web topic" ); - return; - } - - my $stories_ids_table = $db->get_temporary_ids_table( [ map { $_->{ stories_id } } @{ $stories } ] ); - - $db->query( <{ topics_id } ); -update topic_stories set link_mined = 'f' - where - stories_id in ( select id from $stories_ids_table ) and - topics_id = ? and - link_mined = 't' -SQL - - my $queued_stories_ids = []; - for my $story ( @{ $stories } ) - { - next unless ( story_within_topic_date_range( $db, $topic, $story ) ); - - push( @{ $queued_stories_ids }, $story->{ stories_id } ); - - MediaWords::Job::Broker->new( 'MediaWords::Job::TM::ExtractStoryLinks' )->add_to_queue( - { stories_id => $story->{ stories_id }, topics_id => $topic->{ topics_id } }, # - ); - - TRACE( "queued link extraction for story $story->{ title } $story->{ url }." ); - } - - INFO( "waiting for " . scalar( @{ $queued_stories_ids } ) . " link extraction jobs to finish" ); - - my $queued_ids_table = $db->get_temporary_ids_table( $queued_stories_ids ); - - # poll every $JOB_POLL_WAIT seconds waiting for the jobs to complete. die if the number of stories left to process - # has not shrunk for $EXTRACTION_POLL_TIMEOUT seconds. - my $prev_num_queued_stories = scalar( @{ $stories } ); - my $last_change_time = time(); - while ( 1 ) - { - my $queued_stories = $db->query( <{ topics_id } )->flat(); -select stories_id from topic_stories - where stories_id in ( select id from $queued_ids_table ) and topics_id = ? and link_mined = 'f' -SQL - - my $num_queued_stories = scalar( @{ $queued_stories } ); - - last unless ( $num_queued_stories ); - - $last_change_time = time() if ( $num_queued_stories != $prev_num_queued_stories ); - if ( ( time() - $last_change_time ) > $LINK_EXTRACTION_POLL_TIMEOUT ) - { - my $ids_list = join( ', ', @{ $queued_stories } ); - if ( $num_queued_stories > $MAX_LINK_EXTRACTION_TIMEOUT ) - { - LOGDIE( "Timed out waiting for story link extraction ($ids_list)." ); - } - - $db->query( <{ topics_id } ); -update topic_stories set link_mine_error = 'time out' where stories_id in ( $ids_list ) and topics_id = ? -SQL - last; - } - - INFO( "$num_queued_stories stories left in link extraction pool...." 
); - - $prev_num_queued_stories = $num_queued_stories; - sleep( $JOB_POLL_WAIT ); - } - - $db->query( <{ topics_id } ); -update topic_stories set link_mined = 't' - where stories_id in ( select id from $stories_ids_table ) and topics_id = ? and link_mined = 'f' -SQL - $db->query( "drop table $stories_ids_table" ); -} - -# die() with an appropriate error if topic_stories > topics.max_stories; because this check is expensive and we don't -# care if the topic goes over by a few thousand stories, we only actually run the check randmly 1/1000 of the time -sub die_if_max_stories_exceeded($$) -{ - my ( $db, $topic ) = @_; - - my ( $num_topic_stories ) = $db->query( <{ topics_id } )->flat; -select count(*) from topic_stories where topics_id = ? -SQL - - if ( $num_topic_stories > $topic->{ max_stories } ) - { - LOGDIE( "topic has $num_topic_stories stories, which exceeds topic max stories of $topic->{ max_stories }" ); - } -} - -# add the topic_fetch_url to the fetch_link job queue. try repeatedly on failure. -sub queue_topic_fetch_url($;$) -{ - my ( $tfu, $domain_timeout ) = @_; - - $domain_timeout //= $_test_mode ? 0 : undef; - - MediaWords::Job::Broker->new( 'MediaWords::Job::TM::FetchLink' )->add_to_queue( - { - topic_fetch_urls_id => $tfu->{ topic_fetch_urls_id }, - domain_timeout => $domain_timeout - } - ); -} - -# create topic_fetch_urls rows correpsonding to the links and queue a FetchLink job for each. return the tfu rows. -sub create_and_queue_topic_fetch_urls($$$) -{ - my ( $db, $topic, $fetch_links ) = @_; - - my $tfus = []; - for my $link ( @{ $fetch_links } ) - { - if ( $link->{ topic_links_id } && !$db->find_by_id( 'topic_links', $link->{ topic_links_id } ) ) - { - next; - } - my $tfu = $db->create( - 'topic_fetch_urls', - { - topics_id => $topic->{ topics_id }, - url => $link->{ url }, - state => 'pending', - assume_match => MediaWords::Util::Python::normalize_boolean_for_db( $link->{ assume_match } ), - topic_links_id => $link->{ topic_links_id }, - } - ); - push( @{ $tfus }, $tfu ); - - queue_topic_fetch_url( $tfu ); - } - - return $tfus; -} - -sub _fetch_twitter_urls($$$) -{ - my ( $db, $topic, $tfu_ids_list ) = @_; - - # we run into quota limitations with twitter sometimes and need a longer timeout - my $twitter_poll_timeout = $JOB_POLL_TIMEOUT * 5; - - my $twitter_tfu_ids = $db->query( <flat(); -select topic_fetch_urls_id - from topic_fetch_urls tfu - where - tfu.state = 'tweet pending' and - tfu.topic_fetch_urls_id in ( $tfu_ids_list ) -SQL - - return unless ( scalar( @{ $twitter_tfu_ids } ) > 0 ); - - my $tfu_ids_table = $db->get_temporary_ids_table( $twitter_tfu_ids ); - - MediaWords::Job::Broker->new( 'MediaWords::Job::TM::FetchTwitterUrls' )->add_to_queue( - { topic_fetch_urls_ids => $twitter_tfu_ids } - ); - - INFO( "waiting for fetch twitter urls job for " . scalar( @{ $twitter_tfu_ids } ) . " urls" ); - - # poll every $sleep_time seconds waiting for the jobs to complete. die if the number of stories left to process - # has not shrunk for $large_timeout seconds. warn but continue if the number of stories left to process - # is only 5% of the total and short_timeout has passed (this is to make the topic not hang entirely because - # of one link extractor job error). 
- my $prev_num_queued_urls = scalar( @{ $twitter_tfu_ids } ); - my $last_change_time = time(); - while ( 1 ) - { - my $queued_tfus = $db->query( <hashes(); -select tfu.* - from topic_fetch_urls tfu - join $tfu_ids_table ids on ( tfu.topic_fetch_urls_id = ids.id ) - where - state in ('tweet pending') -SQL - - my $num_queued_urls = scalar( @{ $queued_tfus } ); - - last if ( $num_queued_urls == 0 ); - - $last_change_time = time() if ( $num_queued_urls != $prev_num_queued_urls ); - if ( ( time() - $last_change_time ) > $twitter_poll_timeout ) - { - LOGDIE( "Timed out waiting for twitter fetching.\n" . Dumper( $queued_tfus ) ); - } - - INFO( "$num_queued_urls twitter urls left to fetch ..." ); - - $prev_num_queued_urls = $num_queued_urls; - sleep( $JOB_POLL_WAIT ); - } -} - -# list a sample of the pending urls for fetching -sub show_pending_urls($) -{ - my ( $pending_urls ) = @_; - - my $num_pending_urls = scalar( @{ $pending_urls } ); - - my $num_printed_urls = List::Util::min( $num_pending_urls, 3 ); - - my @shuffled_ids = List::Util::shuffle( 0 .. ( $num_pending_urls - 1 ) ); - - for my $id ( @shuffled_ids[ 0 .. ( $num_printed_urls - 1 ) ] ) - { - my $url = $pending_urls->[ $id ]; - INFO( "pending url: $url->{ url } [$url->{ state }: $url->{ fetch_date }]" ); - } -} - -# fetch the given links by creating topic_fetch_urls rows and sending them to the FetchLink queue -# for processing. wait for the queue to complete and returnt the resulting topic_fetch_urls. -sub fetch_links -{ - my ( $db, $topic, $fetch_links ) = @_; - - INFO( "fetch_links: queue links" ); - my $tfus = create_and_queue_topic_fetch_urls( $db, $topic, $fetch_links ); - my $num_queued_links = scalar( @{ $fetch_links } ); - - INFO( "waiting for fetch link queue: $num_queued_links queued" ); - - my $tfu_ids_list = join( ',', map { int( $_->{ topic_fetch_urls_id } ) } @{ $tfus } ); - - my $requeues = 0; - my $max_requeues = 1; - my $max_requeue_jobs = 100; - my $requeue_timeout = 30; - my $instant_requeued = 0; - - # once the pool is this small, just requeue everything with a 0 per site throttle - my $instant_queue_size = 25; - - # how many times to requeues everything if there is no change for $JOB_POLL_TIMEOUT seconds - my $full_requeues = 0; - my $max_full_requeues = 1; - - my $last_pending_change = time(); - my $last_num_pending_urls = 0; - while ( 1 ) - { - my $pending_urls = $db->query( <hashes(); -select *, coalesce( fetch_date::text, 'null' ) fetch_date - from topic_fetch_urls - where - topic_fetch_urls_id in ( $tfu_ids_list ) and - state in ( 'pending', 'requeued' ) -SQL - - my $pending_url_ids = [ map { $_->{ topic_fetch_urls_id } } @{ $pending_urls } ]; - - my $num_pending_urls = scalar( @{ $pending_url_ids } ); - - INFO( "waiting for fetch link queue: $num_pending_urls links remaining ..." ); - - show_pending_urls( $pending_urls ); - - last if ( $num_pending_urls < 1 ); - - # if we only have a handful of job left, requeue them all once with a 0 domain throttle - if ( !$instant_requeued && ( $num_pending_urls <= $instant_queue_size ) ) - { - $instant_requeued = 1; - map { queue_topic_fetch_url( $db->require_by_id( 'topic_fetch_urls', $_ ), 0 ) } @{ $pending_url_ids }; - sleep( $JOB_POLL_WAIT ); - next; - } - - my $time_since_change = time() - $last_pending_change; - - # for some reason, the fetch_link queue is occasionally losing a small number of jobs. 
- if ( ( $time_since_change > $requeue_timeout ) - && ( $requeues < $max_requeues ) - && ( $num_pending_urls < $max_requeue_jobs ) ) - { - INFO( "requeueing fetch_link $num_pending_urls jobs ... [requeue $requeues]" ); - - # requeue with a domain_timeout of 0 so that requeued urls can ignore throttling - map { queue_topic_fetch_url( $db->require_by_id( 'topic_fetch_urls', $_ ), 0 ) } @{ $pending_url_ids }; - ++$requeues; - $last_pending_change = time(); - } - - if ( $time_since_change > $JOB_POLL_TIMEOUT ) - { - if ( $num_pending_urls > $max_requeue_jobs ) - { - die( "timed out waiting for fetch_link jobs: " . scalar( @{ $pending_url_ids } ) ); - } - elsif ( $full_requeues < $max_full_requeues ) - { - map { queue_topic_fetch_url( $db->require_by_id( 'topic_fetch_urls', $_ ) ) } @{ $pending_url_ids }; - ++$full_requeues; - $last_pending_change = time(); - } - else - { - for my $id ( @{ $pending_url_ids } ) - { - $db->update_by_id( 'topic_fetch_urls', $id, { state => 'python error', message => 'timed out' } ); - } - INFO( "timed out " . scalar( @{ $pending_url_ids } ) . " urls" ); - } - } - - $last_pending_change = time() if ( $num_pending_urls < $last_num_pending_urls ); - - $last_num_pending_urls = $num_pending_urls; - - sleep( $JOB_POLL_WAIT ); - } - - _fetch_twitter_urls( $db, $topic, $tfu_ids_list ); - - INFO( "fetch_links: update topic seed urls" ); - $db->query( <query( <hashes(); -select * from topic_fetch_urls where topic_fetch_urls_id in ( $tfu_ids_list ) -SQL - - INFO( "completed fetch link queue" ); - - return $completed_tfus; -} - -# download any unmatched link in new_links, add it as a story, extract it, add any links to the topic_links list. -# each hash within new_links can either be a topic_links hash or simply a hash with a { url } field. if -# the link is a topic_links hash, the topic_link will be updated in the database to point ref_stories_id -# to the new link story. For each link, set the { story } field to the story found or created for the link. -sub add_new_links_chunk($$$$) -{ - my ( $db, $topic, $iteration, $new_links ) = @_; - - die_if_max_stories_exceeded( $db, $topic ); - - INFO( "add_new_links_chunk: fetch_links" ); - my $topic_fetch_urls = fetch_links( $db, $topic, $new_links ); - - INFO( "add_new_links_chunk: mark topic links spidered" ); - my $link_ids = [ grep { $_ } map { $_->{ topic_links_id } } @{ $new_links } ]; - $db->query( < $topic->{ topics_id }, - iteration => $iteration, - links_processed => $num_links, - elapsed_time => $elapsed_time - }; - - $db->create( 'topic_spider_metrics', $topic_spider_metric ); -} - -# call add_new_links in chunks of $ADD_NEW_LINKS_CHUNK_SIZE so we don't lose too much work when we restart the spider -sub add_new_links($$$$;$) -{ - my ( $db, $topic, $iteration, $new_links, $state_updater ) = @_; - - INFO( "add new links" ); - - return unless ( @{ $new_links } ); - - # randomly shuffle the links because it is better for downloading (which has per medium throttling) and extraction - # (which has per medium locking) to distribute urls from the same media source randomly among the list of links. the - # link mining and solr seeding routines that feed most links to this function tend to naturally group links - # from the same media source together. 
- my $shuffled_links = [ List::Util::shuffle( @{ $new_links } ) ]; - - my $spider_progress = get_spider_progress_description( $db, $topic, $iteration, scalar( @{ $shuffled_links } ) ); - - my $num_links = scalar( @{ $shuffled_links } ); - for ( my $i = 0 ; $i < $num_links ; $i += $ADD_NEW_LINKS_CHUNK_SIZE ) - { - my $start_time = time; - - update_topic_state( $db, $state_updater, "$spider_progress; iteration links: $i / $num_links" ); - - my $end = List::Util::min( $i + $ADD_NEW_LINKS_CHUNK_SIZE - 1, $#{ $shuffled_links } ); - add_new_links_chunk( $db, $topic, $iteration, [ @{ $shuffled_links }[ $i .. $end ] ] ); - - my $elapsed_time = time - $start_time; - save_metrics( $db, $topic, $iteration, $end - $i, $elapsed_time ); - } - - mine_topic_stories( $db, $topic ); -} - -# find any links for the topic of this iteration or less that have not already been spidered and call -# add_new_links on them. -sub spider_new_links($$$;$) -{ - my ( $db, $topic, $iteration, $state_updater ) = @_; - - while ( 1 ) - { - INFO( "querying new links ..." ); - - $db->query( "drop table if exists _new_links" ); - - my $num_new_links = $db->query( <{ topics_id } )->rows(); -create temporary table _new_links as - select tl.* - from topic_links tl, topic_stories ts - where - tl.link_spidered = 'f' and - tl.stories_id = ts.stories_id and - ( ts.iteration <= \$1 or ts.iteration = 1000 ) and - ts.topics_id = \$2 and - tl.topics_id = \$2 - order by random() -END - - $db->query( "create index _new_links_tl on _new_links ( topic_links_id )" ); - - last if ( $num_new_links < 1 ); - - INFO( "found $num_new_links new links" ); - - while ( 1 ) - { - my $new_links = $db->query( "select * from _new_links limit ?", $SPIDER_LINKS_CHUNK_SIZE )->hashes(); - - last unless ( @{ $new_links } ); - - my $tl_ids_list = join( ',', map { $_->{ topic_links_id } } @{ $new_links } ); - $db->query( "delete from _new_links where topic_links_id in ($tl_ids_list)" ); - add_new_links( $db, $topic, $iteration, $new_links, $state_updater ); - } - } -} - -# get short text description of spidering progress -sub get_spider_progress_description($$$$) -{ - my ( $db, $topic, $iteration, $total_links ) = @_; - - INFO( "get spider progress description" ); - - my $cid = $topic->{ topics_id }; - - my ( $total_stories ) = $db->query( <flat; -select count(*) from topic_stories where topics_id = ? -SQL - - my ( $stories_last_iteration ) = $db->query( <flat; -select count(*) from topic_stories where topics_id = ? and iteration = ? - 1 -SQL - - my ( $queued_links ) = $db->query( <flat; -select count(*) from topic_links where topics_id = ? and link_spidered = 'f' -SQL - - return "spidering iteration: $iteration; stories last iteration / total: " . - "$stories_last_iteration / $total_stories; links queued: $queued_links; iteration links: $total_links"; -} - -# run the spider over any new links, for $num_iterations iterations -sub run_spider($$;$) -{ - my ( $db, $topic, $state_updater ) = @_; - - INFO( "run spider" ); - - # before we run the spider over links, we need to make sure links have been generated for all existing stories - mine_topic_stories( $db, $topic ); - - map { spider_new_links( $db, $topic, $topic->{ max_iterations }, $state_updater ) } ( 1 .. 
$topic->{ max_iterations } ); -} - -# mine for links any stories in topic_stories for this topic that have not already been mined -sub mine_topic_stories -{ - my ( $db, $topic ) = @_; - - INFO( "mine topic stories" ); - - # skip for non-web topic, because the below query grows very large without ever mining links - if ( $topic->{ platform } ne 'web' ) - { - INFO( "skip link generation for non-web topic" ); - return; - } - - # chunk the story extractions so that one big topic does not take over the entire queue - my $i = 0; - while ( 1 ) - { - $i += $EXTRACT_STORY_LINKS_CHUNK_SIZE; - INFO( "mine topic stories: chunked $i ..." ); - my $stories = $db->query( <{ topics_id }, $EXTRACT_STORY_LINKS_CHUNK_SIZE )->hashes; - select s.*, ts.link_mined, ts.redirect_url - from snap.live_stories s - join topic_stories ts on ( s.stories_id = ts.stories_id and s.topics_id = ts.topics_id ) - where - ts.link_mined = false and - ts.topics_id = ? - limit ? -SQL - - my $num_stories = scalar( @{ $stories } ); - - last if ( $num_stories == 0 ); - - generate_topic_links( $db, $topic, $stories ); - - last if ( $num_stories < $EXTRACT_STORY_LINKS_CHUNK_SIZE ); - } -} - -# import all topic_seed_urls that have not already been processed; -# return 1 if new stories were added to the topic and 0 if not -sub import_seed_urls($$;$) -{ - my ( $db, $topic, $state_updater ) = @_; - - INFO( "import seed urls" ); - - my $topics_id = $topic->{ topics_id }; - - # take care of any seed urls with urls that we have already processed for this topic - $db->query( <query( <hashes; -select * from topic_seed_urls where topics_id = ? and processed = 'f' order by random() -END - - return 0 unless ( @{ $seed_urls } ); - - # process these in chunks in case we have to start over so that we don't have to redo the whole batch - my $num_urls = scalar( @{ $seed_urls } ); - for ( my $i = 0 ; $i < $num_urls ; $i += $ADD_NEW_LINKS_CHUNK_SIZE ) - { - my $start_time = time; - - update_topic_state( $db, $state_updater, "importing seed urls: $i / $num_urls" ); - - my $end = List::Util::min( $i + $ADD_NEW_LINKS_CHUNK_SIZE - 1, $#{ $seed_urls } ); - - # verify that the seed urls are still there and not processed, in case we have mucked with them while spidering - my $urls_ids_list = join( ',', map { int( $_->{ topic_seed_urls_id } ) } @{ $seed_urls }[ $i .. $end] ); - my $seed_urls_chunk = $db->query( <hashes(); -select * from topic_seed_urls where topic_seed_urls_id in ( $urls_ids_list ) and not processed -SQL - - add_new_links_chunk( $db, $topic, 0, $seed_urls_chunk ); - - my $ids_list = join( ',', map { int( $_->{ topic_seed_urls_id } ) } @{ $seed_urls_chunk } ); - - # update topic_seed_urls that were actually fetched - $db->query( <query( <query( - <{ topics_id } - ); - - return scalar( @{ $seed_urls } ); -} - - -# insert a list of topic seed urls -sub insert_topic_seed_urls -{ - my ( $db, $topic_seed_urls ) = @_; - - INFO "inserting " . scalar( @{ $topic_seed_urls } ) . " topic seed urls ..."; - - for my $tsu ( @{ $topic_seed_urls } ) - { - my $insert_tsu; - map { $insert_tsu->{ $_ } = $tsu->{ $_ } } qw/stories_id url topics_id assume_match/; - $db->create( 'topic_seed_urls', $insert_tsu ); - } -} - -# return true if the given month offset is within the dates that should be respidered. 
always return true -# if there are not respider dates -sub _import_month_within_respider_date($$) -{ - my ( $topic, $month_offset ) = @_; - - my $start_date = $topic->{ respider_start_date } || '';; - my $end_date = $topic->{ respider_end_date } || ''; - - return 1 unless ( $topic->{ respider_stories } && ( $start_date || $end_date ) ); - - my $month_date = Time::Piece->strptime( $topic->{ start_date }, "%Y-%m-%d" )->add_months( $month_offset ); - - if ( $end_date ) - { - my $end_date = Time::Piece->strptime( $end_date, "%Y-%m-%d" )->add_months( -1 ); - return 1 if ( $month_date > $end_date ); - } - - if ( $start_date ) - { - my $start_date = Time::Piece->strptime( $start_date, "%Y-%m-%d" ); - return 1 if ( $month_date < $start_date ); - } - - return 0; -} - -# Call search_solr_for_stories_ids() above and then query PostgreSQL for the stories returned by Solr. -# Include stories.* and media_name as the returned fields. -sub __search_for_stories($$) -{ - my ( $db, $params ) = @_; - - my $stories_ids = MediaWords::Solr::search_solr_for_stories_ids( $db, $params ); - - my $stories = [ map { { stories_id => $_ } } @{ $stories_ids } ]; - - $stories = MediaWords::DBI::Stories::attach_story_meta_data_to_stories( $db, $stories ); - - $stories = [ grep { $_->{ url } } @{ $stories } ]; - - return $stories; -} - -# import a single month of the solr seed query. we do this to avoid giant queries that timeout in solr. -sub import_solr_seed_query_month($$$) -{ - my ( $db, $topic, $month_offset ) = @_; - - return 0 unless ( $topic->{ platform } eq 'web' ); - - my $solr_query = MediaWords::Solr::Query::get_full_solr_query_for_topic( $db, $topic, undef, undef, $month_offset ); - - # this should return undef once the month_offset gets too big - return undef unless ( $solr_query ); - - return 1 unless ( _import_month_within_respider_date( $topic, $month_offset ) ); - - my $max_stories = $topic->{ max_stories }; - - # if solr maxes out on returned stories, it returns a few documents less than the rows= parameter, so we - # assume that we hit the solr max if we are within 5% of the ma stories - my $max_returned_stories = $max_stories * 0.95; - - INFO "import solr seed query month offset $month_offset"; - $solr_query->{ rows } = $max_stories; - - my $stories = __search_for_stories( $db, $solr_query ); - - if ( scalar( @{ $stories } ) > $max_returned_stories ) - { - die( "solr_seed_query returned more than $max_returned_stories stories" ); - } - - INFO "adding " . scalar( @{ $stories } ) . " stories to topic_seed_urls"; - - my $topic_seed_urls = []; - for my $story ( @{ $stories } ) - { - push( - @{ $topic_seed_urls }, - { - topics_id => $topic->{ topics_id }, - url => $story->{ url }, - stories_id => $story->{ stories_id }, - assume_match => 'f' - } - ); - } - - insert_topic_seed_urls( $db, $topic_seed_urls ); - - return 1; -} - -# import stories intro topic_seed_urls from solr by running -# topic->{ solr_seed_query } against solr. if the solr query has -# already been imported, do nothing. 
-sub import_solr_seed_query -{ - my ( $db, $topic ) = @_; - - INFO( "import solr seed query" ); - - return if ( $topic->{ solr_seed_query_run } ); - - my $month_offset = 0; - while ( import_solr_seed_query_month( $db, $topic, $month_offset++ ) ) { } - - $db->query( "update topics set solr_seed_query_run = 't' where topics_id = ?", $topic->{ topics_id } ); -} - -# return true if there are no stories without facebook data -sub all_facebook_data_fetched -{ - my ( $db, $topic ) = @_; - - my $null_facebook_story = $db->query( <{ topics_id } )->hash; -select 1 - from topic_stories cs - left join story_statistics ss on ( cs.stories_id = ss.stories_id ) - where - cs.topics_id = ? and - ss.facebook_api_error is null and - ( - ss.stories_id is null or - ss.facebook_share_count is null or - ss.facebook_comment_count is null or - ss.facebook_api_collect_date is null - ) - limit 1 -SQL - - return !$null_facebook_story; -} - -# add all topic stories without facebook data to the queue -sub __add_topic_stories_to_facebook_queue($$) -{ - my ( $db, $topic ) = @_; - - my $topics_id = $topic->{ topics_id }; - - my $stories = $db->query( <hashes; -SELECT ss.*, cs.stories_id - FROM topic_stories cs - left join story_statistics ss on ( cs.stories_id = ss.stories_id ) - WHERE cs.topics_id = ? - ORDER BY cs.stories_id -END - - unless ( scalar @{ $stories } ) - { - DEBUG( "No stories found for topic '$topic->{ name }'" ); - } - - for my $ss ( @{ $stories } ) - { - my $stories_id = $ss->{ stories_id }; - my $args = { stories_id => $stories_id }; - - if ( $ss->{ facebook_api_error } - or !defined( $ss->{ facebook_api_collect_date } ) - or !defined( $ss->{ facebook_share_count } ) - or !defined( $ss->{ facebook_comment_count } ) ) - { - DEBUG( "Adding job for story $stories_id" ); - MediaWords::Job::Broker->new( 'MediaWords::Job::Facebook::FetchStoryStats' )->add_to_queue( $args ); - } - } -} - -# send high priority jobs to fetch facebook data for all stories that don't yet have it -sub fetch_social_media_data ($$) -{ - my ( $db, $topic ) = @_; - - INFO( "fetch social media data" ); - - # test spider should be able to run with job broker, so we skip social media collection - return if ( $_test_mode ); - - my $cid = $topic->{ topics_id }; - - __add_topic_stories_to_facebook_queue( $db, $topic ); - - my $poll_wait = 30; - my $retries = int( $MAX_SOCIAL_MEDIA_FETCH_TIME / $poll_wait ) + 1; - - for my $i ( 1 .. $retries ) - { - return if ( all_facebook_data_fetched( $db, $topic ) ); - sleep $poll_wait; - } - - LOGCONFESS( "Timed out waiting for social media data" ); -} - -# die if the error rate for link extraction or link fetching is too high -sub check_job_error_rate($$) -{ - my ( $db, $topic ) = @_; - - INFO( "check job error rate" ); - - my $fetch_stats = $db->query( <{ topics_id } )->hashes(); -select count(*) num, ( state = 'python error' ) as error - from topic_fetch_urls - where topics_id = ? 
- group by ( state = 'python error' ) -SQL - - my ( $num_fetch_errors, $num_fetch_successes ) = ( 0, 0 ); - for my $s ( @{ $fetch_stats } ) - { - if ( $s->{ error } ) { $num_fetch_errors += $s->{ num } } - else { $num_fetch_successes += $s->{ num } } - } - - my $fetch_error_rate = $num_fetch_errors / ( $num_fetch_errors + $num_fetch_successes + 1 ); - - INFO( "Fetch error rate: $fetch_error_rate ($num_fetch_errors / $num_fetch_successes)" ); - - if ( $fetch_error_rate > $MAX_JOB_ERROR_RATE ) - { - die( "Fetch error rate of $fetch_error_rate is greater than max of $MAX_JOB_ERROR_RATE" ); - } - - my $link_stats = $db->query( <{ topics_id } )->hashes(); -select count(*) num, ( length( link_mine_error) > 0 ) as error - from topic_stories - where topics_id = ? - group by ( length( link_mine_error ) > 0 ) -SQL - - my ( $num_link_errors, $num_link_successes ) = ( 0, 0 ); - for my $s ( @{ $link_stats } ) - { - if ( $s->{ error } ) { $num_link_errors += $s->{ num } } - else { $num_link_successes += $s->{ num } } - } - - my $link_error_rate = $num_link_errors / ( $num_link_errors + $num_link_successes + 1 ); - - INFO( "Link error rate: $link_error_rate ($num_link_errors / $num_link_successes)" ); - - if ( $link_error_rate > $MAX_JOB_ERROR_RATE ) - { - die( "link error rate of $link_error_rate is greater than max of $MAX_JOB_ERROR_RATE" ); - } -} - -# import urls from seed query -sub import_urls_from_seed_queries($$;$) -{ - my ( $db, $topic, $state_updater ) = @_; - - my $topic_seed_queries = $db->query( - "select * from topic_seed_queries where topics_id = ?", $topic->{ topics_id } )->hashes(); - - my $num_queries = scalar( @{ $topic_seed_queries } ); - - if ( ( $num_queries != 1 ) && ( $topic->{ mode } eq 'url_sharing' )) - { - die( "exactly one topic seed query required per url_sharing topic" ); - } - - if ( $topic->{ mode } eq 'web' ) - { - DEBUG( "import seed urls from solr" ); - update_topic_state( $db, $state_updater, "importing solr seed query" ); - import_solr_seed_query( $db, $topic ); - } - - for my $tsq ( @{ $topic_seed_queries } ) - { - my $tsq_dump = $tsq->{ topic_seed_queries_id }; - my $fetcher = MediaWords::TM::FetchTopicPosts::get_post_fetcher( $tsq ); - die( "unable to import seed urls for platform/source of seed query: $tsq_dump" ) unless ( $fetcher ); - - DEBUG( "import seed urls from fetch_topic_posts:\n$tsq_dump" ); - MediaWords::TM::FetchTopicPosts::fetch_topic_posts( $db, $tsq ); - } - - $db->query( <{ topics_id } ); -insert into topic_seed_urls ( url, topics_id, assume_match, source, topic_seed_queries_id, topic_post_urls_id ) - select distinct - tpu.url, - tsq.topics_id, - false, - 'topic_seed_queries', - tsq.topic_seed_queries_id, - tpu.topic_post_urls_id - from - topic_post_urls tpu - join topic_posts tp using ( topic_posts_id ) - join topic_post_days tpd using ( topic_post_days_id ) - join topic_seed_queries tsq using ( topic_seed_queries_id ) - where - tsq.topics_id = ? 
- on conflict ( topic_post_urls_id ) do nothing -SQL -} - -# if the query or dates have changed, set topic_stories.link_mined to false for the impacted stories so that -# they will be respidered -sub set_stories_respidering($$$) -{ - my ( $db, $topic, $snapshots_id ) = @_; - - return unless ( $topic->{ respider_stories } ); - - my $respider_start_date = $topic->{ respider_start_date }; - my $respider_end_date = $topic->{ respider_end_date }; - - if ( !$respider_start_date && !$respider_end_date ) - { - $db->query( "update topic_stories set link_mined = 'f' where topics_id = ?", $topic->{ topics_id } ); - return; - } - - if ( $respider_start_date ) - { - $db->query( <{ start_date }, $topic->{ topics_id } ); -update topic_stories ts set link_mined = 'f' - from stories s - where - ts.stories_id = s.stories_id and - s.publish_date >= \$2 and - s.publish_date <= \$1 and - ts.topics_id = \$3 -SQL - if ( $snapshots_id ) - { - $db->update_by_id( 'snapshots', $snapshots_id, { start_date => $topic->{ start_date } } ); - $db->query( <query( <{ end_date }, $topic->{ topics_id } ); -update topic_stories ts set link_mined = 'f' - from stories s - where - ts.stories_id = s.stories_id and - s.publish_date >= \$1 and - s.publish_date <= \$2 and - ts.topics_id = \$3 -SQL - - if ( $snapshots_id ) - { - $db->update_by_id( 'snapshots', $snapshots_id, { end_date => $topic->{ end_date } } ); - $db->query( < ? -SQL - } - } - - $db->update_by_id( 'topics', $topic->{ topics_id }, - { respider_stories => 'f', respider_start_date => undef, respider_end_date => undef } ); -} - - -# mine the given topic for links and to recursively discover new stories on the web. -# options: -# import_only - only run import_seed_urls and import_solr_seed and exit -# skip_post_processing - skip social media fetching and snapshotting -# snapshots_id - associate topic with the given existing snapshot -sub do_mine_topic($$;$$) -{ - my ( $db, $topic, $options, $state_updater ) = @_; - - map { $options->{ $_ } ||= 0 } qw/import_only skip_post_processing test_mode/; - - update_topic_state( $db, $state_updater, "importing seed urls" ); - import_urls_from_seed_queries( $db, $topic, $state_updater ); - - update_topic_state( $db, $state_updater, "setting stories respidering..." ); - set_stories_respidering( $db, $topic, $options->{ snapshots_id } ); - - # this may put entires into topic_seed_urls, so run it before import_seed_urls. - # something is breaking trying to call this perl. 
commenting out for time being since we only need - # this when we very rarely change the foreign_rss_links field of a media source - hal - # update_topic_state( $db, $state_updater, "merging foreign rss stories" ); - # MediaWords::TM::Stories::merge_foreign_rss_stories( $db, $topic ); - - update_topic_state( $db, $state_updater, "importing seed urls" ); - if ( import_seed_urls( $db, $topic, $state_updater ) > $MIN_SEED_IMPORT_FOR_PREDUP_STORIES ) - { - # merge dup stories before as well as after spidering to avoid extra spidering work - update_topic_state( $db, $state_updater, "merging duplicate stories" ); - MediaWords::TM::Stories::find_and_merge_dup_stories( $db, $topic ); - } - - unless ( $options->{ import_only } ) - { - update_topic_state( $db, $state_updater, "running spider" ); - run_spider( $db, $topic, $state_updater ); - - check_job_error_rate( $db, $topic ); - - # merge dup media and stories again to catch dups from spidering - update_topic_state( $db, $state_updater, "merging duplicate stories" ); - MediaWords::TM::Stories::find_and_merge_dup_stories( $db, $topic ); - - update_topic_state( $db, $state_updater, "merging duplicate media stories" ); - MediaWords::TM::Stories::merge_dup_media_stories( $db, $topic ); - - if ( !$options->{ skip_post_processing } ) - { - update_topic_state( $db, $state_updater, "fetching social media data" ); - fetch_social_media_data( $db, $topic ); - - update_topic_state( $db, $state_updater, "snapshotting" ); - my $snapshot_args = { topics_id => $topic->{ topics_id }, snapshots_id => $options->{ snapshots_id } }; - MediaWords::Job::StatefulBroker->new( 'MediaWords::Job::TM::SnapshotTopic' )->add_to_queue( $snapshot_args ); - } - } -} - -# wrap do_mine_topic in eval and handle errors and state -sub mine_topic ($$;$$) -{ - my ( $db, $topic, $options, $state_updater ) = @_; - - # the topic spider can sit around for long periods doing solr queries, so we need to make sure the postgres - # connection does not get timed out - $db->query( "set idle_in_transaction_session_timeout = 0" ); - - my $prev_test_mode = $_test_mode; - - $_test_mode = 1 if ( $options->{ test_mode } ); - - if ( $topic->{ state } ne 'running' ) - { - MediaWords::TM::Alert::send_topic_alert( $db, $topic, "started topic spidering" ); - } - - eval { do_mine_topic( $db, $topic, $options, $state_updater ); }; - if ( $@ ) - { - my $error = $@; - MediaWords::TM::Alert::send_topic_alert( $db, $topic, "aborted topic spidering due to error" ); - LOGDIE( $error ); - } - - $_test_mode = $prev_test_mode; -} - -1; diff --git a/apps/topics-mine/src/perl/MediaWords/TM/Worker.pm b/apps/topics-mine/src/perl/MediaWords/TM/Worker.pm deleted file mode 100644 index ce467d6869..0000000000 --- a/apps/topics-mine/src/perl/MediaWords/TM/Worker.pm +++ /dev/null @@ -1,95 +0,0 @@ -package MediaWords::TM::Worker; - -# -# Run through stories found for the given topic and find all the links in -# each story. -# -# For each link, try to find whether it matches any given story. If it doesn't, -# create a new story. Add that story's links to the queue if it matches the -# pattern for the topic. Write the resulting stories and links to -# topic_stories and topic_links. 
-# -# Options: -# -# * dedup_stories - run story deduping code over existing topic stories; -# only necessary to rerun new dedup code -# -# * import_only - only run import_seed_urls and import_solr_seed and return -# - -use strict; -use warnings; - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use MediaWords::DB; -use MediaWords::Job::Lock; -use MediaWords::Job::State; -use MediaWords::Job::State::ExtraTable; -use MediaWords::Job::StatefulBroker; -use MediaWords::TM::Mine; - - -sub run_job($) -{ - my $args = shift; - - my $db = MediaWords::DB::connect_to_db(); - - my $topics_id = $args->{ topics_id }; - my $import_only = $args->{ import_only } // 0; - my $cache_broken_downloads = $args->{ cache_broken_downloads } // 0; - my $skip_outgoing_foreign_rss_links = $args->{ skip_outgoing_foreign_rss_links } // 0; - my $skip_post_processing = $args->{ skip_post_processing } // 0; - my $test_mode = $args->{ test_mode } // 0; - my $snapshots_id = $args->{ snapshots_id } // undef; - - my $state_updater = $args->{ state_updater }; - - unless ( $topics_id ) - { - die "'topics_id' is not set."; - } - - unless ( $state_updater ) { - die "State updater is not set."; - } - - my $topic = $db->find_by_id( 'topics', $topics_id ) - or die( "Unable to find topic '$topics_id'" ); - - my $options = { - import_only => $import_only, - cache_broken_downloads => $cache_broken_downloads, - skip_outgoing_foreign_rss_links => $skip_outgoing_foreign_rss_links, - skip_post_processing => $skip_post_processing, - test_mode => $test_mode, - snapshots_id => $snapshots_id - }; - - MediaWords::TM::Mine::mine_topic( $db, $topic, $options, $state_updater ); -} - -sub start_topics_mine_worker($) -{ - my $queue_name = shift; - - my $app = MediaWords::Job::StatefulBroker->new( $queue_name ); - - my $lock = MediaWords::Job::Lock->new( - - # Define this here so that ::MineTopicPublic operates on the same lock - 'MediaWords::Job::TM::MineTopic', - - # Only run one job for each topic at a time - 'topics_id', - - ); - - my $extra_table = MediaWords::Job::State::ExtraTable->new( 'topics', 'state', 'message' ); - my $state = MediaWords::Job::State->new( $extra_table ); - $app->start_worker( \&run_job, $lock, $state ); -} - -1; diff --git a/apps/topics-mine/src/python/topics_mine/mine.py b/apps/topics-mine/src/python/topics_mine/mine.py new file mode 100644 index 0000000000..1588b3446d --- /dev/null +++ b/apps/topics-mine/src/python/topics_mine/mine.py @@ -0,0 +1,1150 @@ +""" +topic spider implementation + +this package implements the parent spider job, which runs the initial seed queries and then queues and +manages the children jobs to fetch and extract links, to fetch social media data, and so on. + +the topic mining process is described in doc/topic_mining.markdown. 
+""" + +import datetime +from dateutil.relativedelta import relativedelta +import random +from time import sleep, time +from typing import Optional, Callable + +from mediawords.db import DatabaseHandler +from mediawords.db.locks import get_session_lock, release_session_lock +import mediawords.dbi.stories +from mediawords.job import JobBroker, StatefulJobBroker, StateUpdater +import mediawords.solr +import mediawords.solr.query +import mediawords.util.sql +import topics_base.alert +import topics_base.stories +import topics_mine.fetch_topic_posts + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +# lock_type to send to get_session_lock +LOCK_TYPE = 'MediaWords::Job::TM::MineTopic' + +# total time to wait for fetching of social media metrics +MAX_SOCIAL_MEDIA_FETCH_TIME = (60 * 60 * 24) + +# add new links in chunks of this size +ADD_NEW_LINKS_CHUNK_SIZE = 10000 + +# extract story links in chunks of this size +EXTRACT_STORY_LINKS_CHUNK_SIZE = 1000 + +# query this many topic_links at a time to spider +SPIDER_LINKS_CHUNK_SIZE = 100000 + +# raise McTopicMineError if the error rate for link fetch or link extract jobs is greater than this +MAX_JOB_ERROR_RATE = 0.02 + +# timeout when polling for jobs to finish +JOB_POLL_TIMEOUT = 600 + +# number of seconds to wait when polling for jobs to finish +JOB_POLL_WAIT = 5 + +# if more than this many seed urls are imported, dedup stories before as well as after spidering +MIN_SEED_IMPORT_FOR_PREDUP_STORIES = 50000 + +# how many link extraction jobs per 1000 can we ignore if they hang +MAX_LINK_EXTRACTION_TIMEOUT = 10 + +# how long to wait to timeout link extraction +LINK_EXTRACTION_POLL_TIMEOUT = 600 + +# domain timeout for link fetching +DOMAIN_TIMEOUT = None + +class McTopicMineError(Exception): + pass + + +def update_topic_state(db: DatabaseHandler, state_updater: Optional[StateUpdater], message: str) -> None: + """ update topics.state in the database""" + + log.info("update topic state: message") + + if not state_updater: + # Shouldn't happen but let's just test it here + log.warning("State updater is unset.") + return + + state_updater.update_job_state_message(db, message) + + +def story_within_topic_date_range(topic: dict, story:dict) -> bool: + """return True if the publish date of the story is within 7 days of the topic date range or if it is undateable""" + + if not story['publish_date']: + return True + + story_date = (story['publish_date'])[0:10] + + start_date = topic['start_date'] + start_date = mediawords.util.sql.increment_day(start_date, -7) + start_date = start_date[0:10] + + end_date = topic['end_date'] + end_date = mediawords.util.sql.increment_day(end_date, 7) + end_date = end_date[0:10] + + return story_date >= start_date and story_date <= end_date + + +def generate_topic_links(db: DatabaseHandler, topic: dict, stories: list): + """submit jobs to extract links from the stories and then poll to wait for the stories to be processed""" + log.info(f"generate topic links: {len(stories)}") + + if len(stories) < 1: + return + + topic_links = [] + + if topic['platform'] != 'web': + log.info("skip link generation for non web topic") + return + + stories_ids_table = db.get_temporary_ids_table([s['stories_id'] for s in stories]) + + db.query( + f""" + update topic_stories set link_mined = 'f' + where + stories_id in (select id from {stories_ids_table}) and + topics_id = %(a)s and + link_mined = 't' + """, + {'a': topic['topics_id']}) + + queued_stories_ids = [] + for story in stories: + if not 
story_within_topic_date_range(topic, story):
+            continue
+
+        queued_stories_ids.append(story['stories_id'])
+
+        JobBroker(queue_name='MediaWords::Job::TM::ExtractStoryLinks').add_to_queue(
+            stories_id=story['stories_id'],
+            topics_id=topic['topics_id'])
+
+        log.debug(f"queued link extraction for story {story['title']} {story['url']}.")
+
+    log.info(f"waiting for {len(queued_stories_ids)} link extraction jobs to finish")
+
+    queued_ids_table = db.get_temporary_ids_table(queued_stories_ids)
+
+    # poll every JOB_POLL_WAIT seconds waiting for the jobs to complete.  raise McTopicMineError if the number
+    # of stories left to process has not shrunk for LINK_EXTRACTION_POLL_TIMEOUT seconds.
+    prev_num_queued_stories = len(stories)
+    last_change_time = time()
+    while True:
+        queued_stories = db.query(
+            f"""
+            select stories_id from topic_stories
+                where stories_id in (select id from {queued_ids_table}) and topics_id = %(a)s and link_mined = 'f'
+            """,
+            {'a': topic['topics_id']}).flat()
+
+        num_queued_stories = len(queued_stories)
+
+        if not num_queued_stories:
+            break
+
+        if num_queued_stories != prev_num_queued_stories:
+            last_change_time = time()
+
+        if (time() - last_change_time) > LINK_EXTRACTION_POLL_TIMEOUT:
+            # stories_id values come back as ints, so cast them before joining for the error message
+            ids_list = ','.join([str(s) for s in queued_stories])
+            if num_queued_stories > MAX_LINK_EXTRACTION_TIMEOUT:
+                raise McTopicMineError(f"Timed out waiting for story link extraction ({ids_list}).")
+
+            db.query(
+                """
+                update topic_stories set link_mine_error = 'time out'
+                    where stories_id = any(%(b)s) and topics_id = %(a)s
+                """,
+                {'a': topic['topics_id'], 'b': queued_stories})
+
+            break
+
+        log.info(f"{num_queued_stories} stories left in link extraction pool....")
+
+        prev_num_queued_stories = num_queued_stories
+        sleep(JOB_POLL_WAIT)
+
+    db.query(
+        f"""
+        update topic_stories set link_mined = 't'
+            where stories_id in (select id from {stories_ids_table}) and topics_id = %(a)s and link_mined = 'f'
+        """,
+        {'a': topic['topics_id']})
+
+    db.query(f"drop table {stories_ids_table}")
+
+
+def die_if_max_stories_exceeded(db: DatabaseHandler, topic: dict) -> None:
+    """
+    raise a McTopicMineError if the number of topic_stories exceeds topics.max_stories.
+    """
+    num_topic_stories = db.query(
+        "select count(*) from topic_stories where topics_id = %(a)s",
+        {'a': topic['topics_id']}).flat()[0]
+
+    if num_topic_stories > topic['max_stories']:
+        raise McTopicMineError(f"{num_topic_stories} stories > {topic['max_stories']}")
+
+
+def queue_topic_fetch_url(tfu: dict, domain_timeout: Optional[int] = None):
+    """ add the topic_fetch_url to the fetch_link job queue."""
+
+    # honor an explicit domain_timeout (the requeue logic passes 0 to bypass per-domain throttling);
+    # otherwise fall back to the module default
+    if domain_timeout is None:
+        domain_timeout = DOMAIN_TIMEOUT
+
+    JobBroker(queue_name='MediaWords::Job::TM::FetchLink').add_to_queue(
+        topic_fetch_urls_id=tfu['topic_fetch_urls_id'],
+        domain_timeout=domain_timeout)
+
+
+def create_and_queue_topic_fetch_urls(db: DatabaseHandler, topic: dict, fetch_links: list) -> list:
+    """
+    create topic_fetch_urls rows corresponding to the links and queue a FetchLink job for each.
+
+    return the tfu rows.
+    """
+    tfus = []
+    for link in fetch_links:
+        topic_links_id = link.get('topic_links_id', None)
+        assume_match = link.get('assume_match', False)
+
+        # if this link has an associated topic_links row but that row has been deleted, ignore it.
+        # this can be used to delete spam urls from topic_links during the spidering process.
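+        # a deleted row just means this url is skipped here; the rest of the chunk keeps spidering.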
+ if topic_links_id and not db.find_by_id('topic_links', topic_links_id): + continue + + tfu = { + 'topics_id': topic['topics_id'], + 'url': link['url'], + 'state': 'pending', + 'assume_match': assume_match, + 'topic_links_id': topic_links_id} + tfu = db.create('topic_fetch_urls', tfu) + + tfus.append(tfu) + + queue_topic_fetch_url(tfu) + + return tfus + + +def _fetch_twitter_urls(db: DatabaseHandler, topic: dict, tfu_ids: list) -> None: + """ + Send topic_fetch_urls to fetch_twitter_urls queue and wait for the jobs to complete. + """ + # we run into quota limitations sometimes and need a longer timeout + twitter_poll_timeout = JOB_POLL_TIMEOUT * 5 + + twitter_tfu_ids = db.query( + """ + select topic_fetch_urls_id + from topic_fetch_urls tfu + where + tfu.state = 'tweet pending' and + tfu.topic_fetch_urls_id = any(%(a)s) + """, {'a': tfu_ids}).flat() + + if not twitter_tfu_ids: + return + + tfu_ids_table = db.get_temporary_ids_table(twitter_tfu_ids) + + JobBroker(queue_name='MediaWords::Job::TM::FetchTwitterUrls').add_to_queue( + topic_fetch_urls_ids=twitter_tfu_ids) + + log.info(f"waiting for fetch twitter urls job for {len(twitter_tfu_ids)} urls") + + # poll every sleep_time seconds waiting for the jobs to complete. + # raise McTopicMineError if the number of stories left to process + # has not shrunk for large_timeout seconds. warn but continue if the number of stories left to process + # is only 5% of the total and short_timeout has passed (this is to make the topic not hang entirely because + # of one link extractor job error). + prev_num_queued_urls = len(twitter_tfu_ids) + last_change_time = time() + while True: + queued_tfus = db.query( + f""" + select tfu.* + from topic_fetch_urls tfu + join {tfu_ids_table} ids on (tfu.topic_fetch_urls_id = ids.id) + where + state in ('tweet pending') + """).hashes() + + num_queued_urls = len(queued_tfus) + + if num_queued_urls == 0: + break + + if num_queued_urls != prev_num_queued_urls: + last_change_time = time() + + if (time() - last_change_time) > twitter_poll_timeout: + raise McTopicMineError(f"Timed out waiting for twitter fetching {queued_tfus}") + + log.info(f"{num_queued_urls} twitter urls left to fetch ...") + + prev_num_queued_urls = num_queued_urls + sleep(JOB_POLL_WAIT) + + +def list_pending_urls(pending_urls: list) -> str: + """list a sample of the pending urls for fetching""" + num_pending_urls = len(pending_urls) + + num_printed_urls = min(num_pending_urls, 3) + + random.shuffle(pending_urls) + urls = pending_urls[0:num_printed_urls] + + return "\n".join([f"pending url: {url['url']} [{url['state']}: {url['fetch_date']}]" for url in urls]) + + +def fetch_links(db: DatabaseHandler, topic: dict, fetch_links: dict) -> None: + """ + fetch the given links by creating topic_fetch_urls rows and sending them to the FetchLink queue + for processing. wait for the queue to complete and return the resulting topic_fetch_urls. 
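+
+    each item in fetch_links should be a dict with at least a 'url' key and, optionally,
+    'topic_links_id' and 'assume_match' keys; the wait loop polls until no queued row is left in
+    the 'pending' or 'requeued' state.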
+ """ + + log.info("fetch_links: queue links") + tfus = create_and_queue_topic_fetch_urls(db, topic, fetch_links) + num_queued_links = len(fetch_links) + + log.info(f"waiting for fetch link queue: {num_queued_links} queued") + + tfu_ids = [tfu['topic_fetch_urls_id'] for tfu in tfus] + + requeues = 0 + max_requeues = 1 + max_requeue_jobs = 100 + requeue_timeout = 30 + instant_requeued = 0 + + # once the pool is this small, just requeue everything with a 0 per site throttle + instant_queue_size = 25 + + # how many times to requeues everything if there is no change for JOB_POLL_TIMEOUT seconds + full_requeues = 0 + max_full_requeues = 1 + + last_pending_change = time() + last_num_pending_urls = 0 + while True: + pending_urls = db.query( + """ + select *, coalesce(fetch_date::text, 'null') fetch_date + from topic_fetch_urls + where + topic_fetch_urls_id = any(%(a)s) and + state in ('pending', 'requeued') + """, + {'a': tfu_ids}).hashes() + + pending_url_ids = [u['topic_fetch_urls_id'] for u in pending_urls] + + num_pending_urls = len(pending_url_ids) + + log.info(f"waiting for fetch link queue: {num_pending_urls} links remaining ...") + log.info(list_pending_urls(pending_urls)) + + if num_pending_urls < 1: + break + + # if we only have a handful of job left, requeue them all once with a 0 domain throttle + if not instant_requeued and num_pending_urls <= instant_queue_size: + instant_requeued = 1 + [queue_topic_fetch_url(db.require_by_id('topic_fetch_urls', id), 0) for id in pending_url_ids] + sleep(JOB_POLL_WAIT) + continue + + time_since_change = time() - last_pending_change + + # for some reason, the fetch_link queue is occasionally losing a small number of jobs. + if (time_since_change > requeue_timeout and + requeues < max_requeues and + num_pending_urls < max_requeue_jobs): + log.info(f"requeueing fetch_link {num_pending_urls} jobs ... 
[{requeues} requeues]")
+
+            # requeue with a domain_timeout of 0 so that requeued urls can ignore throttling
+            [queue_topic_fetch_url(db.require_by_id('topic_fetch_urls', id), 0) for id in pending_url_ids]
+            requeues += 1
+            last_pending_change = time()
+
+        if time_since_change > JOB_POLL_TIMEOUT:
+            if num_pending_urls > max_requeue_jobs:
+                raise McTopicMineError("Timed out waiting for fetch link queue")
+            elif full_requeues < max_full_requeues:
+                [queue_topic_fetch_url(db.require_by_id('topic_fetch_urls', id)) for id in pending_url_ids]
+                full_requeues += 1
+                last_pending_change = time()
+            else:
+                for id in pending_url_ids:
+                    db.update_by_id('topic_fetch_urls', id, {'state': 'python error', 'message': 'timed out'})
+
+                log.info(f"timed out {len(pending_url_ids)} urls")
+
+        if num_pending_urls < last_num_pending_urls:
+            last_pending_change = time()
+
+        last_num_pending_urls = num_pending_urls
+
+        sleep(JOB_POLL_WAIT)
+
+    _fetch_twitter_urls(db, topic, tfu_ids)
+
+    log.info("fetch_links: update topic seed urls")
+    db.query(
+        """
+        update topic_seed_urls tsu
+            set stories_id = tfu.stories_id, processed = 't'
+        from topic_fetch_urls tfu
+        where
+            tfu.url = tsu.url and
+            tfu.stories_id is not null and
+            tfu.topic_fetch_urls_id = any(%(a)s) and
+            tfu.topics_id = tsu.topics_id
+        """,
+        {'a': tfu_ids})
+
+    completed_tfus = db.query(
+        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
+        {'a': tfu_ids}).hashes()
+
+    log.info("completed fetch link queue")
+
+    return completed_tfus
+
+
+def add_new_links_chunk(db: DatabaseHandler, topic: dict, iteration: int, new_links: list) -> None:
+    """
+    download any unmatched link in new_links, add it as a story, extract it, add any links to the topic_links list.
+
+    each hash within new_links can either be a topic_links hash or simply a hash with a {url} field.  if
+    the link is a topic_links hash, the topic_link will be updated in the database to point ref_stories_id
+    to the new link story.  For each link, set the {story} field to the story found or created for the link.
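+
+    as a side effect, any link that carries a topic_links_id is marked link_spidered = 't' in
+    topic_links once its chunk has been fetched.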
+ """ + die_if_max_stories_exceeded(db, topic) + + log.info("add_new_links_chunk: fetch_links") + topic_fetch_urls = fetch_links(db, topic, new_links) + + log.info("add_new_links_chunk: mark topic links spidered") + link_ids = [l['topic_links_id'] for l in new_links if 'topic_links_id' in l] + + db.query( + "update topic_links set link_spidered = 't' where topic_links_id = any(%(a)s)", + {'a': link_ids}) + + +def save_metrics(db: DatabaseHandler, topic: dict, iteration: int, num_links: int, elapsed_time: int) -> None: + """save a row in the topic_spider_metrics table to track performance of spider""" + + topic_spider_metric = { + 'topics_id': topic['topics_id'], + 'iteration': iteration, + 'links_processed': num_links, + 'elapsed_time': elapsed_time + } + + db.create('topic_spider_metrics', topic_spider_metric) + + +def add_new_links(db:DatabaseHandler, topic:dict, iteration:int, new_links:list, state_updater:Callable) -> None: + """call add_new_links in chunks of ADD_NEW_LINKS_CHUNK_SIZE""" + log.info("add new links") + + if not new_links: + return + + spider_progress = get_spider_progress_description(db, topic, iteration, len(new_links)) + + num_links = len(new_links) + + i = 0 + while i < num_links: + start_time = time() + + update_topic_state(db, state_updater, f"spider_progress iteration links: {i} / {num_links}") + + chunk_links = new_links[i:i + ADD_NEW_LINKS_CHUNK_SIZE] + add_new_links_chunk(db, topic, iteration, chunk_links) + + elapsed_time = time() - start_time + save_metrics(db, topic, iteration, len(chunk_links), elapsed_time) + + i += ADD_NEW_LINKS_CHUNK_SIZE + + mine_topic_stories(db, topic) + + +def get_new_links(db: DatabaseHandler, iteration: int, topics_id: int) -> list: + """query the database for new links from stories below the given iteration.""" + + new_links = db.query( + """ + select tl.* + from + topic_links tl + join topic_stories ts using ( topics_id ) + where + tl.link_spidered = 'f' and + tl.stories_id = ts.stories_id and + (ts.iteration <= %(a)s or ts.iteration = 1000) and + ts.topics_id = %(b)s + + limit %(c)s + """, + {'a': iteration, 'b': topics_id, 'c': SPIDER_LINKS_CHUNK_SIZE}).hashes() + + return new_links + + +def spider_new_links( + db: DatabaseHandler, topic: dict, iteration: int, state_updater: Optional[StateUpdater]) -> None: + """call add_new_links on topic_links for which link_spidered is false.""" + + while True: + log.info("querying new links ...") + + db.query("drop table if exists _new_links") + + num_new_links = db.query( + """ + create temporary table _new_links as + select tl.* + from topic_links tl, topic_stories ts + where + tl.link_spidered = 'f' and + tl.stories_id = ts.stories_id and + (ts.iteration <= %(a)s or ts.iteration = 1000) and + ts.topics_id = %(b)s and + tl.topics_id = %(b)s + order by random() + """, + {'a': iteration, 'b': topic['topics_id']}).rows() + + db.query("create index _new_links_tl on _new_links (topic_links_id)") + + if num_new_links < 1: + break + + log.info(f"found {num_new_links} new links") + + while True: + new_links = db.query("select * from _new_links limit %(a)s", {'a': SPIDER_LINKS_CHUNK_SIZE}).hashes() + if not new_links: + break + + tl_ids = [n['topic_links_id'] for n in new_links] + db.query("delete from _new_links where topic_links_id = any(%(a)s)", {'a': tl_ids}) + add_new_links(db, topic, iteration, new_links, state_updater) + +def get_spider_progress_description(db: DatabaseHandler, topic: dict, iteration: int, total_links: int) -> str: + """get short text description of spidering progress""" + + 
log.info("get spider progress description") + + topics_id = topic['topics_id'] + + total_stories = db.query( + "select count(*) from topic_stories where topics_id = %(a)s", + {'a': topics_id}).flat()[0] + + stories_last_iteration = db.query( + "select count(*) from topic_stories where topics_id = %(a)s and iteration = %(b)s - 1", + {'a': topics_id, 'b': iteration}).flat()[0] + + queued_links = db.query( + "select count(*) from topic_links where topics_id = %(a)s and not link_spidered", + {'a': topics_id}).flat()[0] + + return ( + f"spidering iteration: {iteration} stories last iteration / total: " + f"{stories_last_iteration} / {total_stories} links queued: {queued_links} iteration links: {total_links}" + ) + + +def run_spider(db: DatabaseHandler, topic: dict, state_updater: Optional[StateUpdater]) -> None: + """run the spider over any new links, for num_iterations iterations""" + log.info("run spider") + + # before we run the spider over links, we need to make sure links have been generated for all existing stories + mine_topic_stories(db, topic) + + iterations = topic['max_iterations'] + [spider_new_links(db, topic, iterations, state_updater) for i in range(iterations)] + + +def mine_topic_stories(db: DatabaseHandler, topic: dict) -> None: + """ mine for links any stories in topic_stories for this topic that have not already been mined""" + log.info("mine topic stories") + + # skip for non-web topic, because the below query grows very large without ever mining links + if topic['platform'] != 'web': + log.info("skip link generation for non-web topic") + return + + # chunk the story extractions so that one big topic does not take over the entire queue + i = 0 + while True: + i += EXTRACT_STORY_LINKS_CHUNK_SIZE + log.info("mine topic stories: chunked i ...") + stories = db.query( + """ + select s.*, ts.link_mined, ts.redirect_url + from snap.live_stories s + join topic_stories ts on (s.stories_id = ts.stories_id and s.topics_id = ts.topics_id) + where + ts.link_mined = false and + ts.topics_id = %(a)s + limit %(b)s + """, {'a': topic['topics_id'], 'b': EXTRACT_STORY_LINKS_CHUNK_SIZE}).hashes() + + num_stories = len(stories) + + generate_topic_links(db, topic, stories) + + if num_stories < EXTRACT_STORY_LINKS_CHUNK_SIZE: + break + + +def import_seed_urls(db: DatabaseHandler, topic: dict, state_updater: Optional[StateUpdater]) -> int: + """ import all topic_seed_urls that have not already been processed + + return number of seed urls imported + """ + log.info("import seed urls") + + topics_id = topic['topics_id'] + + # take care of any seed urls with urls that we have already processed for this topic + db.query( + """ + update topic_seed_urls a set stories_id = b.stories_id, processed = 't' + from topic_seed_urls b + where a.url = b.url and + a.topics_id = %(a)s and b.topics_id = a.topics_id and + a.stories_id is null and b.stories_id is not null + """, + {'a': topics_id}) + + # randomly shuffle this query so that we don't block the extractor pool by throwing it all + # stories from a single media_id at once + seed_urls = db.query( + "select * from topic_seed_urls where topics_id = %(a)s and processed = 'f' order by random()", + {'a': topics_id}).hashes() + + if not seed_urls: + return 0 + + # process these in chunks in case we have to start over so that we don't have to redo the whole batch + num_urls = len(seed_urls) + i = 0 + while i < num_urls: + start_time = time() + + update_topic_state(db, state_updater, f"importing seed urls: {i} / {num_urls}") + + chunk_urls = seed_urls[i:i + 
ADD_NEW_LINKS_CHUNK_SIZE] + + # verify that the seed urls exist and not processed, in case we have mucked with them while spidering + url_ids = [u['topic_seed_urls_id'] for u in chunk_urls] + seed_urls_chunk = db.query( + "select * from topic_seed_urls where topic_seed_urls_id = any(%(a)s) and not processed", + {'a': url_ids}).hashes() + + add_new_links_chunk(db, topic, 0, seed_urls_chunk) + + url_ids = [u['topic_seed_urls_id'] for u in seed_urls_chunk] + + # update topic_seed_urls that were actually fetched + db.query( + """ + update topic_seed_urls tsu + set stories_id = tfu.stories_id + from topic_fetch_urls tfu + where + tsu.topics_id = tfu.topics_id and + md5(tsu.url) = md5(tfu.url) and + tsu.topic_seed_urls_id = any(%(a)s) + """, + {'a': url_ids}) + + # now update the topic_seed_urls that were matched + db.query( + """ + update topic_seed_urls tsu + set processed = 't' + where + tsu.topic_seed_urls_id = any(%(a)s) and + processed = 'f' + """, + {'a': url_ids}) + + elapsed_time = time() - start_time + save_metrics(db, topic, 1, len(chunk_urls), elapsed_time) + + i += ADD_NEW_LINKS_CHUNK_SIZE + + # cleanup any topic_seed_urls pointing to a merged story + db.query( + """ + UPDATE topic_seed_urls AS tsu + SET stories_id = tms.target_stories_id, processed = 't' + FROM topic_merged_stories_map AS tms, + topic_stories ts + WHERE tsu.stories_id = tms.source_stories_id + AND ts.stories_id = tms.target_stories_id + AND tsu.topics_id = ts.topics_id + AND ts.topics_id = %(a)s + """, + {'a': topic['topics_id']}) + + return len(seed_urls) + + +def insert_topic_seed_urls(db: DatabaseHandler, topic_seed_urls: list) -> None: + """ insert a list of topic seed urls""" + log.info(f"inserting {len(topic_seed_urls)} topic seed urls ...") + + for tsu in topic_seed_urls: + insert_tsu = {f: tsu[f] for f in ('stories_id', 'url', 'topics_id', 'assume_match')} + db.create('topic_seed_urls', insert_tsu) + + +def _import_month_within_respider_date(topic: dict, month_offset: int) -> bool: + """ return True if the given month offset is within the dates that should be respidered. + + always return True if there are no respider dates + """ + + start_date = topic['respider_start_date'] or '' + end_date = topic['respider_end_date'] or '' + + if not (topic['respider_stories'] and (start_date or end_date)): + return True + + month_date = datetime.datetime.strptime(topic['start_date'], '%Y-%m-%d') + relativedelta(months=month_offset) + log.warning(month_date) + + if end_date: + end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d') + relativedelta(months=-1) + log.warning(f"end_date: {end_date}") + if month_date > end_date: + return True + + if start_date: + start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d') + log.warning(f"start_date: {start_date}") + if month_date < start_date: + return True + + return False + + +def _search_for_stories_urls(db: DatabaseHandler, params: dict) -> list: + """Call search_solr_for_stories_ids() and then query postgres for the stories urls. + + Return dicts with stories_id and url fields.""" + + stories_ids = mediawords.solr.search_solr_for_stories_ids(db, params) + + stories = db.query("select stories_id,url from stories where stories_id = any(%(a)s)", {'a': stories_ids}).hashes() + + return stories + + +def import_solr_seed_query_month(db: DatabaseHandler, topic: dict, month_offset: int) -> bool: + """ import a single month of the solr seed query. we do this to avoid giant queries that timeout in solr. 
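+    the month_offset is counted in months forward from the topic start_date.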
+ + return True if the month_offset is valid for the topic.""" + if not topic['platform'] == 'web': + return False + + solr_query = mediawords.solr.query.get_full_solr_query_for_topic(db=db, topic=topic, month_offset=month_offset) + + # this should return undef once the month_offset gets too big + if not solr_query: + return False + + if not _import_month_within_respider_date(topic, month_offset): + return True + + max_stories = topic['max_stories'] + + # if solr maxes out on returned stories, it returns a few documents less than the rows= parameter, so we + # assume that we hit the solr max if we are within 5% of the max stories + max_returned_stories = max_stories * 0.95 + + log.info(f"import solr seed query month offset {month_offset}") + solr_query['rows'] = max_stories + + stories = _search_for_stories_urls(db, solr_query) + + if len(stories) > max_returned_stories: + raise McTopicMineError(f"solr_seed_query returned more than {max_returned_stories} stories") + + log.info(f"adding {len(stories)} stories to topic_seed_urls") + + topic_seed_urls = [] + for story in stories: + tsu = { + 'topics_id': topic['topics_id'], + 'url': story['url'], + 'stories_id': story['stories_id'], + 'assume_match': 'f'} + topic_seed_urls.append(tsu) + + insert_topic_seed_urls(db, topic_seed_urls) + + return True + + +def import_solr_seed_query(db: DatabaseHandler, topic: dict) -> None: + """ import stories into topic_seed_urls from solr by running topic['solr_seed_query'] against solr. + + if the solr query has already been imported, do nothing.""" + + log.info("import solr seed query") + + if topic['solr_seed_query_run']: + return + + month_offset = 0 + while import_solr_seed_query_month(db, topic, month_offset): + month_offset += 1 + pass + + db.query("update topics set solr_seed_query_run = 't' where topics_id = %(a)s", {'a': topic['topics_id']}) + + +def all_facebook_data_fetched(db: DatabaseHandler, topic: dict) -> bool: + """ return True if there are no stories without facebook data""" + + null_facebook_story = db.query( + """ + select 1 + from topic_stories cs + left join story_statistics ss on (cs.stories_id = ss.stories_id) + where + cs.topics_id = %(a)s and + ss.facebook_api_error is null and + ( + ss.stories_id is null or + ss.facebook_share_count is null or + ss.facebook_comment_count is null or + ss.facebook_api_collect_date is null + ) + limit 1 + """, + {'a': topic['topics_id']}).hash() + + return null_facebook_story is None + + +def _add_topic_stories_to_facebook_queue(db: DatabaseHandler, topic: dict) -> None: + """ add all topic stories without facebook data to the queue""" + topics_id = topic['topics_id'] + + stories = db.query( + """ + SELECT ss.*, cs.stories_id + FROM topic_stories cs + left join story_statistics ss on (cs.stories_id = ss.stories_id) + WHERE cs.topics_id = %(a)s + ORDER BY cs.stories_id + """, + {'a': topics_id}).hashes() + + if not stories: + log.debug("No stories found for topic 'topic['name']'") + + for ss in stories: + if (ss['facebook_api_error'] or + ss['facebook_api_collect_date'] is None or + ss['facebook_share_count'] is None or + ss['facebook_comment_count'] is None): + log.debug(f"Adding job for story {ss['stories_id']}") + args = {'stories_id': ss['stories_id']} + + JobBroker(queue_name='MediaWords::Job::Facebook::FetchStoryStats').add_to_queue( + stories_id=ss['stories_id']) + + +def fetch_social_media_data(db: DatabaseHandler, topic: dict) -> None: + """ send jobs to fetch facebook data for all stories that don't yet have it""" + + log.info("fetch social 
media data") + + cid = topic['topics_id'] + + _add_topic_stories_to_facebook_queue(db, topic) + + poll_wait = 30 + retries = int(MAX_SOCIAL_MEDIA_FETCH_TIME / poll_wait) + 1 + + for i in range(retries): + if all_facebook_data_fetched(db, topic): + return + sleep(poll_wait) + + raise McTopicMineError("Timed out waiting for social media data") + + +def check_job_error_rate(db: DatabaseHandler, topic: dict) -> None: + """ raise an error if error rate for link extraction or link fetching is too high""" + + log.info("check job error rate") + + fetch_stats = db.query( + """ + select count(*) num, (state = 'python error') as error + from topic_fetch_urls + where topics_id = %(a)s + group by (state = 'python error') + """, + {'a': topic['topics_id']}).hashes() + + num_fetch_errors = sum([s['num'] for s in fetch_stats if s['error']]) + num_fetch_successes = sum([s['num'] for s in fetch_stats if not s['error']]) + + fetch_error_rate = num_fetch_errors / (num_fetch_errors + num_fetch_successes + 1) + + log.info(f"Fetch error rate: {fetch_error_rate} ({num_fetch_errors} / {num_fetch_successes})") + + if fetch_error_rate > MAX_JOB_ERROR_RATE: + raise McTopicMineError(f"Fetch error rate of {fetch_error_rate} is greater than {MAX_JOB_ERROR_RATE}") + + link_stats = db.query( + """ + select count(*) num, (length( link_mine_error) > 0) as error + from topic_stories + where topics_id = %(a)s + group by (length(link_mine_error) > 0) + """, + {'a': topic['topics_id']}).hashes() + + num_link_errors = sum([s['num'] for s in link_stats if s['error']]) + num_link_successes = sum([s['num'] for s in link_stats if not s['error']]) + + link_error_rate = num_link_errors / (num_link_errors + num_link_successes + 1) + + log.info(f"Link error rate: {link_error_rate} ({num_link_errors} / {num_link_successes})") + + if link_error_rate > MAX_JOB_ERROR_RATE: + raise McTopicMineError(f"link error rate of {link_error_rate} is greater than {MAX_JOB_ERROR_RATE}") + + +def import_urls_from_seed_queries(db: DatabaseHandler, topic: dict, state_updater: Optional[StateUpdater]) -> None: + """ import urls from seed query """ + + topic_seed_queries = db.query( + "select * from topic_seed_queries where topics_id = %(a)s", + {'a': topic['topics_id']}).hashes() + + log.debug("import seed urls from solr") + update_topic_state(db, state_updater, "importing solr seed query") + import_solr_seed_query(db, topic) + + for tsq in topic_seed_queries: + tsq_dump = tsq['topic_seed_queries_id'] + fetcher = topics_mine.fetch_topic_posts.get_post_fetcher(tsq) + if not fetcher: + raise McTopicMineError(f"unable to import seed urls for platform/source of seed query: {tsq_dump}") + + log.debug(f"import seed urls from fetch_topic_posts:\n{tsq_dump}") + topics_mine.fetch_topic_posts.fetch_topic_posts(db, tsq) + + db.query( + """ + insert into topic_seed_urls + (url, topics_id, assume_match, source, topic_seed_queries_id, topic_post_urls_id) + select distinct + tpu.url, + tsq.topics_id, + false, + 'topic_seed_queries', + tsq.topic_seed_queries_id, + tpu.topic_post_urls_id + from + topic_post_urls tpu + join topic_posts tp using (topic_posts_id) + join topic_post_days tpd using (topic_post_days_id) + join topic_seed_queries tsq using (topic_seed_queries_id) + where + tsq.topics_id = %(a)s + on conflict (topic_post_urls_id) do nothing + """, + {'a': topic['topics_id']}) + + +def set_stories_respidering(db: DatabaseHandler, topic: dict, snapshots_id: int) -> None: + """ if the query or dates have changed, set topic_stories.link_mined to false so they will be 
respidered""" + + if not topic['respider_stories']: + return + + respider_start_date = topic['respider_start_date'] + respider_end_date = topic['respider_end_date'] + + if not respider_start_date and not respider_end_date: + db.query("update topic_stories set link_mined = 'f' where topics_id = %(a)s", {'a': topic['topics_id']}) + return + + if respider_start_date: + db.query( + """ + update topic_stories ts set link_mined = 'f' + from stories s + where + ts.stories_id = s.stories_id and + s.publish_date >= %(b)s and + s.publish_date <= %(a)s and + ts.topics_id = %(c)s + """, + {'a': respider_start_date, 'b': topic['start_date'], 'c': topic['topics_id']}) + + if snapshots_id: + db.update_by_id('snapshots', snapshots_id, {'start_date': topic['start_date']}) + db.query( + """ + update timespans set archive_snapshots_id = snapshots_id, snapshots_id = null + where snapshots_id = %(a)s and start_date < %(b)s + """, + {'a': snapshots_id, 'b': respider_start_date}) + + if respider_end_date: + db.query( + """ + update topic_stories ts set link_mined = 'f' + from stories s + where + ts.stories_id = s.stories_id and + s.publish_date >= %(a)s and + s.publish_date <= %(b)s and + ts.topics_id = %(c)s + """, + {'a': respider_end_date, 'b': topic['end_date'], 'c': topic['topics_id']}) + + if snapshots_id: + db.update_by_id('snapshots', snapshots_id, {'end_date': topic['end_date']}) + db.query( + """ + update timespans set archive_snapshots_id = snapshots_id, snapshots_id = null + where snapshots_id = %(a)s and end_date > %(b)s + """, + {'a': snapshots_id, 'b': respider_end_date}) + + db.update_by_id( + 'topics', + topic['topics_id'], + {'respider_stories': 'f', 'respider_start_date': None, 'respider_end_date': None}) + + +def do_mine_topic(db: DatabaseHandler, topic: dict, options: dict) -> None: + """ mine the given topic for links and to recursively discover new stories on the web. + + options: + import_only - only run import_seed_urls and import_solr_seed and exit + skip_post_processing - skip social media fetching and snapshotting + snapshots_id - associate topic with the given existing snapshot + state_updater - object that implements mediawords.job.StateUpdater + """ + [options.setdefault(f, None) for f in 'state_updater import_only skip_post_processing snapshots_id'.split()] + + state_updater = options['state_updater'] + + update_topic_state(db, state_updater, "importing seed urls") + import_urls_from_seed_queries(db, topic, state_updater) + + update_topic_state(db, state_updater, "setting stories respidering...") + set_stories_respidering(db, topic, options['snapshots_id']) + + # this may put entires into topic_seed_urls, so run it before import_seed_urls. + # something is breaking trying to call this perl. 
commenting out for time being since we only need
+    # this when we very rarely change the foreign_rss_links field of a media source - hal
+    # update_topic_state(db, state_updater, "merging foreign rss stories")
+    # topics_base.stories.merge_foreign_rss_stories(db, topic)
+
+    update_topic_state(db, state_updater, "importing seed urls")
+    if import_seed_urls(db, topic, state_updater) > MIN_SEED_IMPORT_FOR_PREDUP_STORIES:
+        # merge dup stories before as well as after spidering to avoid extra spidering work
+        update_topic_state(db, state_updater, "merging duplicate stories")
+        topics_base.stories.find_and_merge_dup_stories(db, topic)
+
+    if not options.get('import_only', False):
+        update_topic_state(db, state_updater, "running spider")
+        run_spider(db, topic, state_updater)
+
+        check_job_error_rate(db, topic)
+
+        # merge dup media and stories again to catch dups from spidering
+        update_topic_state(db, state_updater, "merging duplicate stories")
+        topics_base.stories.find_and_merge_dup_stories(db, topic)
+
+        update_topic_state(db, state_updater, "merging duplicate media stories")
+        topics_base.stories.merge_dup_media_stories(db, topic)
+
+        if not options.get('skip_post_processing', False):
+            update_topic_state(db, state_updater, "fetching social media data")
+            fetch_social_media_data(db, topic)
+
+            update_topic_state(db, state_updater, "snapshotting")
+            snapshot_args = {'topics_id': topic['topics_id'], 'snapshots_id': options['snapshots_id']}
+            StatefulJobBroker(queue_name='MediaWords::Job::TM::SnapshotTopic').add_to_queue(**snapshot_args)
+
+
+def mine_topic(db: DatabaseHandler, topic: dict, **options: dict) -> None:
+    """ wrap do_mine_topic in try and handle errors and state"""
+
+    # the topic spider can sit around for long periods doing solr queries, so we need to make sure the postgres
+    # connection does not get timed out
+    db.query("set idle_in_transaction_session_timeout = 0")
+
+    if topic['state'] != 'running':
+        topics_base.alert.send_topic_alert(db, topic, "started topic spidering")
+
+    get_session_lock(db=db, lock_type=LOCK_TYPE, lock_id=topic['topics_id'])
+
+    try:
+        do_mine_topic(db, topic, options)
+    except Exception as e:
+        topics_base.alert.send_topic_alert(db, topic, "aborted topic spidering due to error")
+        raise e
+
+    release_session_lock(db=db, lock_type=LOCK_TYPE, lock_id=topic['topics_id'])
+
+
+def run_worker_job(topics_id: int, snapshots_id: Optional[int] = None) -> None:
+    """run a topics-mine worker job."""
+    # these helpers are only needed by this worker entry point, so import them locally
+    from mediawords.db import connect_to_db
+    from mediawords.util.perl import decode_object_from_bytes_if_needed
+
+    if isinstance(snapshots_id, bytes):
+        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)
+    if snapshots_id is not None:
+        snapshots_id = int(snapshots_id)
+
+    if isinstance(topics_id, bytes):
+        topics_id = decode_object_from_bytes_if_needed(topics_id)
+    if topics_id is not None:
+        topics_id = int(topics_id)
+
+    if not bool(topics_id):
+        raise McTopicMineError("topics_id must be set")
+
+    db = connect_to_db()
+
+    topic = db.require_by_id('topics', topics_id)
+
+    mine_topic(db=db, topic=topic, snapshots_id=snapshots_id)
diff --git a/apps/topics-mine/src/python/topics_mine/test.py b/apps/topics-mine/src/python/topics_mine/test.py
new file mode 100644
index 0000000000..efb2d6b156
--- /dev/null
+++ b/apps/topics-mine/src/python/topics_mine/test.py
@@ -0,0 +1,47 @@
+import mediawords.db
+from mediawords.test.db.create import create_test_topic
+from mediawords.test.solr import create_test_story_stack_for_indexing, setup_test_index
+import topics_mine.mine
+
+from mediawords.util.log import create_logger
+log = create_logger(__name__)
+
+def 
create_topic_for_import(db: mediawords.db.DatabaseHandler, num_stories : int = 200) -> dict: + """create a test topic and stories for import into the topic. + + return the topic. + """ + topic = create_test_topic(db, 'import') + + stack = {'medium_1': {'feed_1': [f"story_{_}" for _ in range(num_stories)]}} + create_test_story_stack_for_indexing(db, stack) + + all_media = db.query("select * from media").hashes() + all_stories = db.query("select * from stories").hashes() + + topic['start_date'] = '2020-01-01' + topic['end_date'] = '2020-06-01' + topic['solr_seed_query'] = '*:*' + topic['solr_seed_query_run'] = False + + db.update_by_id('topics', topic['topics_id'], topic) + + for m in all_media: + db.query( + "insert into topics_media_map (topics_id, media_id) values (%(a)s, %(b)s)", + {'a': topic['topics_id'], 'b': m['media_id']}) + + # distribute one story each day. this is kludgy but should work from a fresh databse with + # sequential stories_ids. assumes that there are more stories than days in the date range above + stories = db.query("select * from stories").hashes() + for (i, story) in enumerate(stories): + db.query( + """ + update stories set publish_date = %(a)s::timestamp + ((%(b)s || ' days')::interval) + where stories_id = %(c)s + """, + {'a': topic['start_date'], 'b': i, 'c': story['stories_id']}) + + setup_test_index(db) + + return topic diff --git a/apps/topics-mine/src/requirements.txt b/apps/topics-mine/src/requirements.txt new file mode 100644 index 0000000000..7d5173a96e --- /dev/null +++ b/apps/topics-mine/src/requirements.txt @@ -0,0 +1,2 @@ +# test text generation +lorem diff --git a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/AddTestTopicStories.pm b/apps/topics-mine/tests/perl/MediaWords/TM/Mine/AddTestTopicStories.pm deleted file mode 100644 index 643bd6860b..0000000000 --- a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/AddTestTopicStories.pm +++ /dev/null @@ -1,27 +0,0 @@ -package AddTestTopicStories; - -use strict; -use warnings; - -use MediaWords::CommonLibs; -use MediaWords::Test::DB::Create; -use MediaWords::TM::Stories; - -my $_topic_stories_medium_count = 0; - -sub add_test_topic_stories($$$$) -{ - my ( $db, $topic, $num_stories, $label ) = @_; - - my $medium = MediaWords::Test::DB::Create::create_test_medium( $db, "$label " . $_topic_stories_medium_count++ ); - my $feed = MediaWords::Test::DB::Create::create_test_feed( $db, $label, $medium ); - - for my $i ( 1 .. 
$num_stories ) - { - my $story = MediaWords::Test::DB::Create::create_test_story( $db, "$label $i", $feed ); - MediaWords::TM::Stories::add_to_topic_stories( $db, $story, $topic ); - $db->update_by_id( 'stories', $story->{ stories_id }, { publish_date => $topic->{ start_date } } ); - } -} - -1; diff --git a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_die_if_max_stories_exceeded.t b/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_die_if_max_stories_exceeded.t deleted file mode 100755 index eb4f2bbdb8..0000000000 --- a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_die_if_max_stories_exceeded.t +++ /dev/null @@ -1,52 +0,0 @@ -use strict; -use warnings; - -use Test::Deep; -use Test::More tests => 3; - -use MediaWords::CommonLibs; -use MediaWords::DB; -use MediaWords::Test::DB::Create; -use MediaWords::TM::Mine; - -use FindBin; -use lib $FindBin::Bin; - -use AddTestTopicStories; - -sub test_die_if_max_stories_exceeded($) -{ - my ( $db ) = @_; - - my $label = "test_die_if_max_stories_exceeded"; - - my $topic = MediaWords::Test::DB::Create::create_test_topic( $db, $label ); - - $topic = $db->update_by_id( 'topics', $topic->{ topics_id }, { max_stories => 0 } ); - - AddTestTopicStories::add_test_topic_stories( $db, $topic, 101, $label ); - - eval { MediaWords::TM::Mine::die_if_max_stories_exceeded( $db, $topic ); }; - ok( $@, "$label adding 101 stories to 0 max_stories topic generates error" ); - - $db->query( "delete from topic_stories where topics_id = ?", $topic->{ topics_id } ); - - $topic = $db->update_by_id( 'topics', $topic->{ topics_id }, { max_stories => 100 } ); - - AddTestTopicStories::add_test_topic_stories( $db, $topic, 99, $label ); - eval { MediaWords::TM::Mine::die_if_max_stories_exceeded( $db, $topic ); }; - ok( !$@, "$label adding 999 stories to a 100 max_stories does not generate an error: $@" ); - - AddTestTopicStories::add_test_topic_stories( $db, $topic, 102, $label ); - eval { MediaWords::TM::Mine::die_if_max_stories_exceeded( $db, $topic ); }; - ok( $@, "$label adding 2001 stories to a 100 max_stories generates an error" ); -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_die_if_max_stories_exceeded( $db ); -} - -main(); diff --git a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_import_urls_from_seed_query.t b/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_import_urls_from_seed_query.t deleted file mode 100644 index a8c3b29998..0000000000 --- a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_import_urls_from_seed_query.t +++ /dev/null @@ -1,87 +0,0 @@ -use strict; -use warnings; - -use Test::Deep; -use Test::More; - -use MediaWords::CommonLibs; -use MediaWords::DB; -use MediaWords::Test::DB::Create; -use MediaWords::TM::Mine; - -use FindBin; -use lib $FindBin::Bin; - -sub test_import_urls_from_seed_queries($) -{ - my ( $db ) = @_; - - my $label = "test_import"; - - my $topic = MediaWords::Test::DB::Create::create_test_topic( $db, $label ); - - $topic->{ start_date } = '2019-01-01'; - $topic->{ end_date } = '2019-02-01'; - $topic->{ platform } = 'generic_post'; - $topic->{ mode } = 'web'; - $topic->{ pattern } = 'foo'; - - $topic = $db->update_by_id( 'topics', $topic->{ topics_id }, $topic ); - - # posts.append({ - # 'post_id': post_id, - # 'content': "sample post for id id %s" % test_url, - # 'publish_date': publish_date, - # 'url': test_url, - # 'author': 'user-%s' % user_id, - # 'channel': 'channel-%s' % user_id, - # }) - my $posts_csv = < $topic->{ topics_id }, - source => 'csv', - platform => 'generic_post', - query 
=> $posts_csv - }; - my $topic_seed_query = $db->create( 'topic_seed_queries', $topic_seed_query_data ); - - $topic_seed_query_data->{ query } = $posts_2_csv; - my $topic_seed_query_2 = $db->create( 'topic_seed_queries', $topic_seed_query_data ); - - MediaWords::TM::Mine::import_urls_from_seed_queries( $db, $topic ); - - my $topic_posts = $db->query( <{ topics_id } )->hashes(); -select * - from topic_posts tp - join topic_post_days tpd using ( topic_post_days_id ) - join topic_seed_queries tsq using ( topic_seed_queries_id ) - where topics_id = ? -SQL - - is ( scalar( @{ $topic_posts } ), 2, "number of topic posts" ); - - my $tsus = $db->query( "select * from topic_seed_urls where topics_id = ?", $topic->{ topics_id } )->hashes(); - - is( scalar( @{ $tsus } ), 2, "number of seed urls" ); - -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_import_urls_from_seed_queries( $db ); - - done_testing(); -} - -main(); diff --git a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_respider.t b/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_respider.t deleted file mode 100755 index 019e075fd9..0000000000 --- a/apps/topics-mine/tests/perl/MediaWords/TM/Mine/test_respider.t +++ /dev/null @@ -1,122 +0,0 @@ -use strict; -use warnings; - -use Test::Deep; -use Test::More tests => 4; - -use MediaWords::CommonLibs; -use MediaWords::DB; -use MediaWords::Test::DB::Create; -use MediaWords::TM::Mine; - -use FindBin; -use lib $FindBin::Bin; - -use AddTestTopicStories; - -sub test_respider($) -{ - my ( $db ) = @_; - - my $label = "test_respider"; - - my $topic = MediaWords::Test::DB::Create::create_test_topic( $db, $label ); - - $topic->{ start_date } = '2017-01-01'; - $topic->{ end_date } = '2018-01-01'; - - $topic = $db->update_by_id( - 'topics', - $topic->{ topics_id }, - { max_stories => 0, start_date => '2017-01-01', end_date => '2018-01-01' } - ); - - my $num_topic_stories = 101; - AddTestTopicStories::add_test_topic_stories( $db, $topic, $num_topic_stories, $label ); - - # no respidering without respider_stories - $db->query( "update topic_stories set link_mined = 't'" ); - - MediaWords::TM::Mine::set_stories_respidering( $db, $topic, undef ); - - my ( $got_num_respider_stories ) = $db->query( "select count(*) from topic_stories where not link_mined" )->flat; - is( $got_num_respider_stories, 0, "no stories marked for respidering" ); - - # respider everything with respider_stories but no dates - $topic->{ respider_stories } = 1; - - $db->query( "update topic_stories set link_mined = 't'" ); - - MediaWords::TM::Mine::set_stories_respidering( $db, $topic, undef ); - - ( $got_num_respider_stories ) = $db->query( "select count(*) from topic_stories where not link_mined" )->flat; - is( $got_num_respider_stories, $num_topic_stories, "all stories marked for respidering" ); - - # respider stories within the range of changed dates - my $topic_update = { - respider_stories => 't', - respider_end_date => $topic->{ end_date }, - respider_start_date => $topic->{ start_date }, - end_date => '2019-01-01', - start_date => '2016-01-01', - }; - $topic = $db->update_by_id( 'topics', $topic->{ topics_id }, $topic_update ); - - $db->query( "update topic_stories set link_mined = 't'" ); - - my $num_date_changes = 10; - $db->query( "update stories set publish_date = '2017-06-01'" ); - $db->query( <query( < $topic->{ topics_id }, - snapshot_date => MediaWords::Util::SQL::sql_now(), - start_date => $topic->{ start_date }, - end_date => $topic->{ end_date } - }; - $snapshot = $db->create( 'snapshots', 
$snapshot ); - - my $timespan_dates = - [ [ '2017-01-01', '2017-01-31' ], [ '2017-12-20', '2018-01-20' ], [ '2016-12-20', '2017-01-20' ] ]; - for my $dates ( @{ $timespan_dates } ) - { - my ( $start_date, $end_date ) = @{ $dates }; - my $timespan = { - snapshots_id => $snapshot->{ snapshots_id }, - start_date => $start_date, - end_date => $end_date, - period => 'monthly', - story_count => 0, - story_link_count => 0, - medium_count => 0, - medium_link_count => 0, - post_count => 0 - }; - $timespan = $db->create( 'timespans', $timespan ); - } - - MediaWords::TM::Mine::set_stories_respidering( $db, $topic, $snapshot->{ snapshots_id } ); - - ( $got_num_respider_stories ) = $db->query( "select count(*) from topic_stories where not link_mined" )->flat; - is( $got_num_respider_stories, 2 * $num_date_changes, "dated stories marked for respidering" ); - - my ( $got_num_archived_timespans ) = - $db->query( "select count(*) from timespans where archive_snapshots_id = ?", $snapshot->{ snapshots_id } )->flat; - is( $got_num_archived_timespans, 2, "number of archive timespans" ); -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_respider( $db ); -} - -main(); diff --git a/apps/topics-mine/tests/perl/test_cd_live_stories.t b/apps/topics-mine/tests/perl/test_cd_live_stories.t deleted file mode 100644 index f1faa7b413..0000000000 --- a/apps/topics-mine/tests/perl/test_cd_live_stories.t +++ /dev/null @@ -1,183 +0,0 @@ -use strict; -use warnings; - -# test that inserts and updates on stories in topic_stories are correctly mirrored to snap.live_stories - -use English '-no_match_vars'; - -use Test::More tests => 14; -use Test::Deep; - -use MediaWords::DB; -use MediaWords::Util::SQL; - -BEGIN -{ - use_ok( 'MediaWords::DB' ); -} - -sub add_topic_story -{ - my ( $db, $topic, $story ) = @_; - - $db->create( 'topic_stories', { stories_id => $story->{ stories_id }, topics_id => $topic->{ topics_id } } ); -} - -sub test_live_story_matches -{ - my ( $db, $topic, $story, $test_label ) = @_; - - my $live_story = $db->query( <{ topics_id }, $story->{ stories_id } )->hash; -select * from snap.live_stories where topics_id = ? and stories_id = ? -END - - delete( $live_story->{ topics_id } ); - delete( $live_story->{ topic_stories_id } ); - - $live_story->{ publish_date } =~ s/T/ /g; - $live_story->{ collect_date } =~ s/T/ /g; - $story->{ publish_date } =~ s/T/ /g; - $story->{ collect_date } =~ s/T/ /g; - - cmp_deeply( $live_story, $story, "$test_label: $story->{ title } should be in $topic->{ name } and match story" ); -} - -sub test_live_story_absent -{ - my ( $db, $topic, $story, $test_label ) = @_; - - my $live_story = $db->query( <{ topics_id }, $story->{ stories_id } )->hash; -select * from snap.live_stories where topics_id = ? and stories_id = ? -END - is( $live_story, undef, "$test_label: \$story->{ title } should be absent from \$topic->{ title }" ); -} - -sub update_story -{ - my ( $db, $story ) = @_; - - $story->{ url } ||= '/' . rand(); - $story->{ guid } ||= '/' . rand(); - $story->{ title } ||= ' ' . rand(); - $story->{ description } ||= ' ' . 
rand(); - $story->{ publish_date } = MediaWords::Util::SQL::get_sql_date_from_epoch( time() - int( rand( 100000 ) ) ); - $story->{ collect_date } = MediaWords::Util::SQL::get_sql_date_from_epoch( time() - int( rand( 100000 ) ) ); - - $db->update_by_id( 'stories', $story->{ stories_id }, $story ); - - return $db->find_by_id( 'stories', $story->{ stories_id } ); -} - -sub test_live_stories -{ - my ( $db ) = @_; - - my $medium = { - name => "test live stories", - url => "url://test/live/stories", - }; - $medium = $db->create( 'media', $medium ); - - my $topic_a = { - name => 'topic a', - pattern => '', - solr_seed_query => '', - solr_seed_query_run => 'f', - description => 'topic A', - start_date => '2017-01-01', - end_date => '2017-02-01', - job_queue => 'mc', - max_stories => 100_000, - platform => 'web' - }; - $topic_a = $db->create( 'topics', $topic_a ); - - my $topic_b = { - name => 'topic b', - pattern => '', - solr_seed_query => '', - solr_seed_query_run => 'f', - description => 'topic B', - start_date => '2017-01-01', - end_date => '2017-02-01', - job_queue => 'mc', - max_stories => 100_000, - platform => 'web' - }; - $topic_b = $db->create( 'topics', $topic_b ); - - my $story_a = { - media_id => $medium->{ media_id }, - url => 'url://story/a', - guid => 'guid://story/a', - title => 'story a', - description => 'description a', - publish_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 100000 ), - collect_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 200000 ), - full_text_rss => 't' - }; - $story_a = $db->create( 'stories', $story_a ); - - my $story_b = { - media_id => $medium->{ media_id }, - url => 'url://story/b', - guid => 'guid://story/b', - title => 'story b', - description => 'description b', - publish_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 300000 ), - collect_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 400000 ), - full_text_rss => 'f' - }; - $story_b = $db->create( 'stories', $story_b ); - - my $story_c = { - media_id => $medium->{ media_id }, - url => 'url://story/c', - guid => 'guid://story/c', - title => 'story c', - description => 'description c', - publish_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 500000 ), - collect_date => MediaWords::Util::SQL::get_sql_date_from_epoch( time() - 600000 ), - full_text_rss => 'f' - }; - $story_c = $db->create( 'stories', $story_c ); - - my $live_story = $db->query( "select * from snap.live_stories" )->hash; - is( $live_story, undef, "live stories empty before cs insert" ); - - add_topic_story( $db, $topic_a, $story_a ); - add_topic_story( $db, $topic_b, $story_b ); - add_topic_story( $db, $topic_a, $story_c ); - add_topic_story( $db, $topic_b, $story_c ); - - test_live_story_matches( $db, $topic_a, $story_a, "after insert" ); - test_live_story_absent( $db, $topic_b, $story_a, "after insert" ); - - test_live_story_matches( $db, $topic_b, $story_b, "after insert" ); - test_live_story_absent( $db, $topic_a, $story_b, "after insert" ); - - test_live_story_matches( $db, $topic_a, $story_c, "after insert" ); - test_live_story_matches( $db, $topic_b, $story_c, "after insert" ); - - $story_a = update_story( $db, $story_a ); - $story_b = update_story( $db, $story_b ); - $story_c = update_story( $db, $story_c ); - - test_live_story_matches( $db, $topic_a, $story_a, "after update" ); - test_live_story_absent( $db, $topic_b, $story_a, "after update" ); - - test_live_story_matches( $db, $topic_b, $story_b, "after update" ); - test_live_story_absent( 
$db, $topic_a, $story_b, "after update" ); - - test_live_story_matches( $db, $topic_a, $story_c, "after update" ); - test_live_story_matches( $db, $topic_b, $story_c, "after update" ); -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_live_stories( $db ); -} - -main(); diff --git a/apps/topics-mine/tests/perl/test_import_month_within_respider_date.t b/apps/topics-mine/tests/perl/test_import_month_within_respider_date.t deleted file mode 100644 index 091b27c979..0000000000 --- a/apps/topics-mine/tests/perl/test_import_month_within_respider_date.t +++ /dev/null @@ -1,56 +0,0 @@ -use strict; -use warnings; - -# test TM::Mine::_import_month_within_respider_date - -use English '-no_match_vars'; - -use Test::More; - -use MediaWords::TM::Mine; - -sub test_import_month_within_respider_date() -{ - my $topic = { - start_date => '2019-01-01', - end_date => '2019-06-01', - respider_stories => 'f', - respider_start_date => undef, - respider_end_date => undef - }; - - # if none of the respider setting are correct, we should always return true - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 0 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 1 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 100 ) ); - - # if respider_stories is true but neither respider date is set, always return true - $topic->{ respider_stories } = 1; - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 0 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 1 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 100 ) ); - - # should only import the dates after the respider end date - $topic->{ respider_end_date } = '2019-05-01'; - ok( !MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 0 ) ); - ok( !MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 3 ) ); - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 4 ) ); - - # make sure we capture the whole previous month if the end date is within a month - $topic->{ respider_end_date } = '2019-04-02'; - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 3 ) ); - - # should only import the dates before the repsider start date - $topic->{ respider_start_date } = '2019-02-01'; - ok( MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 0 ) ); - ok( !MediaWords::TM::Mine::_import_month_within_respider_date( $topic, 1 ) ); -} - -sub main -{ - test_import_month_within_respider_date(); - - done_testing(); -} - -main(); diff --git a/apps/topics-mine/tests/perl/test_tm_mine.t b/apps/topics-mine/tests/perl/test_tm_mine.t deleted file mode 100644 index 2f3d96805a..0000000000 --- a/apps/topics-mine/tests/perl/test_tm_mine.t +++ /dev/null @@ -1,557 +0,0 @@ -use strict; -use warnings; - -# basic intergration test for topic mapper's spider - -use Modern::Perl "2015"; -use MediaWords::CommonLibs; - -use English '-no_match_vars'; - -use Data::Dumper; -use Digest::MD5 qw(md5_hex); -use MediaWords::Test::HashServer; -use Readonly; -use Sys::Hostname; -use Test::More; -use Text::Lorem::More; - -use MediaWords::DB; -use MediaWords::TM::Mine; -use MediaWords::Util::SQL; -use MediaWords::Util::Web; - -Readonly my $BASE_PORT => 8890; - -Readonly my $NUM_SITES => 5; -Readonly my $NUM_PAGES_PER_SITE => 10; -Readonly my $NUM_LINKS_PER_PAGE => 2; - -Readonly my $TOPIC_PATTERN => 'FOOBARBAZ'; - -sub get_html_link($) -{ - my ( $page ) = @_; - - my $lorem = 
Text::Lorem::More->new(); - - if ( 0 && int( rand( 3 ) ) ) - { - return "" . $lorem->words( 2 ) . ""; - } - else - { - return $page->{ url }; - } -} - -sub generate_content_for_site($) -{ - my ( $site ) = @_; - - my $lorem = Text::Lorem::More->new(); - - my $body = $lorem->sentences( 5 ); - - return < - - $site->{ title } - - -

-    <p>
-        $body
-    </p>

- - -HTML -} - -sub generate_content_for_page($$) -{ - my ( $site, $page ) = @_; - - my $lorem = Text::Lorem::More->new(); - - my $num_links = scalar( @{ $page->{ links } } ); - my $num_paragraphs = int( rand( 10 ) + 3 ) + $num_links; - - my $paragraphs = []; - - for my $i ( 0 .. $num_paragraphs - 1 ) - { - my $text = $lorem->sentences( 5 ); - if ( $i < $num_links ) - { - my $html_link = get_html_link( $page->{ links }->[ $i ] ); - $text .= " $html_link"; - } - - push( @{ $paragraphs }, $text ); - } - - if ( rand( 2 ) < 1 ) - { - push( @{ $paragraphs }, $lorem->words( 10 ) . " $TOPIC_PATTERN" ); - $page->{ matches_topic } = 1; - } - - my $dead_link_text = $lorem->sentences( 5 ); - $dead_link_text .= " dead link"; - - push( @{ $paragraphs }, $dead_link_text ); - - my $body = join( "\n\n", map { "

<p>\n$_\n</p>

" } @{ $paragraphs } ); - - return < - - $page->{ title } - - - $body - - -HTML - -} - -sub generate_content_for_sites($) -{ - my ( $sites ) = @_; - - for my $site ( @{ $sites } ) - { - $site->{ content } = generate_content_for_site( $site ); - - for my $page ( @{ $site->{ pages } } ) - { - $page->{ content } = generate_content_for_page( $site, $page ); - } - } -} - -# generate test set of sites -sub get_test_sites() -{ - my $sites = []; - my $pages = []; - - # my $base_port = $BASE_PORT + int( rand( 200 ) ); - my $base_port = $BASE_PORT; - - for my $site_id ( 0 .. $NUM_SITES - 1 ) - { - my $port = $base_port + $site_id; - - my $site = { - port => $port, - id => $site_id, - - # Other containers will access this host to we have to set the - # actual hostname instead of just localhost - url => "http://" . Sys::Hostname::hostname . ":$port/", - - title => "site $site_id" - }; - - my $num_pages = int( rand( $NUM_PAGES_PER_SITE ) ) + 1; - for my $page_id ( 0 .. $num_pages - 1 ) - { - my $date = MediaWords::Util::SQL::get_sql_date_from_epoch( time() - ( rand( 365 ) * 86400 ) ); - - my $path = "page-$page_id"; - - my $page = { - id => $page_id, - path => "/$path", - url => "$site->{ url }$path", - title => "page $page_id", - pubish_date => $date, - links => [] - }; - - push( @{ $pages }, $page ); - push( @{ $site->{ pages } }, $page ); - } - - push( @{ $sites }, $site ); - } - - my $all_pages = []; - map { push( @{ $all_pages }, @{ $_->{ pages } } ) } @{ $sites }; - for my $page ( @{ $all_pages } ) - { - my $num_links = int( rand( $NUM_LINKS_PER_PAGE ) ); - for my $link_id ( 0 .. $num_links - 1 ) - { - my $linked_page_id = int( rand( scalar( @{ $all_pages } ) ) ); - my $linked_page = $all_pages->[ $linked_page_id ]; - - unless ( MediaWords::Util::URL::urls_are_equal( $page->{ url }, $linked_page->{ url } ) ) - { - push( @{ $page->{ links } }, $linked_page ); - } - } - } - - generate_content_for_sites( $sites ); - - return $sites; -} - -# add a medium for each site so that the topic mapper's spider can find the medium that corresponds to each url -sub add_site_media($$) -{ - my ( $db, $sites ) = @_; - - for my $site ( @{ $sites } ) - { - $site->{ medium } = $db->create( - 'media', - { - url => $site->{ url }, - name => $site->{ title }, - } - ); - } -} - -sub start_hash_servers($) -{ - my ( $sites ) = @_; - - my $hash_servers = []; - - for my $site ( @{ $sites } ) - { - my $site_hash = {}; - - $site_hash->{ '/' } = $site->{ content }; - - map { $site_hash->{ $_->{ path } } = $_->{ content } } @{ $site->{ pages } }; - - my $hs = MediaWords::Test::HashServer->new( $site->{ port }, $site_hash ); - - DEBUG "starting hash server $site->{ id }"; - - $hs->start(); - - push( @{ $hash_servers }, $hs ); - } - - # wait for the hash servers to start - sleep( 1 ); - - return $hash_servers; -} - -sub test_page($$$) -{ - my ( $label, $url, $expected_content ) = @_; - - TRACE "test page: $label $url"; - - my $ua = MediaWords::Util::Web::UserAgent->new(); - my $request = MediaWords::Util::Web::UserAgent::Request->new( 'GET', $url ); - my $response = $ua->request( $request ); - - ok( $response->is_success, "request success: $label $url" ); - - my $got_content = $response->decoded_content; - - TRACE "got content"; - - is( $got_content, $expected_content, "simple page test: $label" ); -} - -sub test_pages($) -{ - my ( $sites ) = @_; - - for my $site ( @{ $sites } ) - { - DEBUG "testing pages for site $site->{ id }"; - test_page( "site $site->{ id }", $site->{ url }, $site->{ content } ); - - map { test_page( "page 
$site->{ id } $_->{ id }", $_->{ url }, $_->{ content } ) } @{ $site->{ pages } }; - } -} - -sub seed_unlinked_urls($$$) -{ - my ( $db, $topic, $sites ) = @_; - - my $all_pages = []; - map { push( @{ $all_pages }, @{ $_->{ pages } } ) } @{ $sites }; - - # do not seed urls that are linked directly from a page that is a topic match. - # this forces the test to succesfully discover those pages through spidering. - my $non_seeded_url_lookup = {}; - for my $page ( @{ $all_pages } ) - { - if ( $page->{ matches_topic } ) - { - map { $non_seeded_url_lookup->{ $_->{ url } } = 1 } @{ $page->{ links } }; - } - } - - my $seed_pages = []; - for my $page ( @{ $all_pages } ) - { - if ( $non_seeded_url_lookup->{ $page->{ url } } ) - { - DEBUG( "non seeded url: $page->{ url }" ); - } - else - { - DEBUG( "seed url: $page->{ url }" ); - push( @{ $seed_pages }, $page ); - } - } - - for my $seed_page ( @{ $all_pages } ) - { - $db->create( - 'topic_seed_urls', - { - topics_id => $topic->{ topics_id }, - url => $seed_page->{ url } - } - ); - } -} - -sub create_topic($$) -{ - my ( $db, $sites ) = @_; - - my $now = MediaWords::Util::SQL::sql_now(); - my $start_date = MediaWords::Util::SQL::increment_day( $now, -30 ); - my $end_date = MediaWords::Util::SQL::increment_day( $now, 30 ); - - my $topic = $db->create( - 'topics', - { - name => 'test topic', - description => 'test topic', - pattern => $TOPIC_PATTERN, - solr_seed_query => 'stories_id:0', - solr_seed_query_run => 't', - start_date => $start_date, - end_date => $end_date, - job_queue => 'mc', - max_stories => 100_000, - platform => 'web' - } - ); - - seed_unlinked_urls( $db, $topic, $sites ); - - # avoid race condition in TM::Mine - $db->create( 'tag_sets', { name => 'extractor_version' } ); - - return $topic; -} - -sub test_topic_stories($$$) -{ - my ( $db, $topic, $sites ) = @_; - - my $topic_stories = $db->query( <{ topics_id } )->hashes; -select cs.*, s.* - from topic_stories cs - join stories s on ( s.stories_id = cs.stories_id ) - where cs.topics_id = ? -SQL - - my $all_pages = []; - map { push( @{ $all_pages }, @{ $_->{ pages } } ) } @{ $sites }; - - DEBUG "ALL PAGES: " . scalar( @{ $all_pages } ); - - my $topic_pages = [ grep { $_->{ matches_topic } } @{ $all_pages } ]; - - DEBUG "TOPIC PAGES: " . scalar( @{ $topic_pages } ); - - my $topic_pages_lookup = {}; - map { $topic_pages_lookup->{ $_->{ url } } = $_ } @{ $topic_stories }; - - for my $topic_story ( @{ $topic_stories } ) - { - ok( $topic_pages_lookup->{ $topic_story->{ url } }, "topic story found for topic page '$topic_story->{ url }'" ); - - delete( $topic_pages_lookup->{ $topic_story->{ url } } ); - } - - is( scalar( keys( %{ $topic_pages_lookup } ) ), - 0, "missing topic story for topic pages: " . 
Dumper( values( %{ $topic_pages_lookup } ) ) ); - - # Wait for pending URLs to disappear - Readonly my $WAIT_PENDING_SECONDS => 10; - my $pending_count = 0; - for ( my $pending_retry = 0; $pending_retry <= $WAIT_PENDING_SECONDS; ++$pending_retry ) { - ( $pending_count ) = $db->query( "select count(*) from topic_fetch_urls where state ='pending'" )->flat; - if ( $pending_count > 0 ) { - WARN "Still $pending_count URLs are pending, will retry shortly"; - sleep( 1 ); - } else { - INFO "No more pending URLs, continuing"; - last; - } - } - is( $pending_count, 0, "After waiting $WAIT_PENDING_SECONDS some URLs are still in 'pending' state" ); - - my ( $dead_link_count ) = $db->query( "select count(*) from topic_fetch_urls where state ='request failed'" )->flat; - is( $dead_link_count, scalar( @{ $topic_pages } ), "dead link count" ); - - if ( $dead_link_count != scalar( @{ $topic_pages } ) ) - { - my $fetch_states = $db->query( "select count(*), state from topic_fetch_urls group by state" )->hashes(); - WARN( "fetch states: " . Dumper( $fetch_states ) ); - - my $fetch_errors = $db->query( "select * from topic_fetch_urls where state = 'python error'" )->hashes(); - WARN( "fetch errors: " . Dumper( $fetch_errors ) ); - } -} - -sub test_topic_links($$$) -{ - my ( $db, $topic, $sites ) = @_; - - my $cid = $topic->{ topics_id }; - - my $cl = $db->query( "select * from topic_links" )->hashes; - - TRACE "topic links: " . Dumper( $cl ); - - my $all_pages = []; - map { push( @{ $all_pages }, @{ $_->{ pages } } ) } @{ $sites }; - - for my $page ( @{ $all_pages } ) - { - next if ( !$page->{ matches_topic } ); - - for my $link ( @{ $page->{ links } } ) - { - next unless ( $link->{ matches_topic } ); - - my $topic_links = $db->query( <{ url }, $link->{ url }, $cid )->hashes; -select * - from topic_links cl - join stories s on ( cl.stories_id = s.stories_id ) - where - s.url = \$1 and - cl.url = \$2 and - cl.topics_id = \$3 -SQL - - is( scalar( @{ $topic_links } ), 1, "number of topic_links for $page->{ url } -> $link->{ url }" ); - } - } - - my $topic_spider_metric = $db->query( <{ topics_id } )->hash; -select sum( links_processed ) links_processed from topic_spider_metrics where topics_id = ? -SQL - - ok( $topic_spider_metric, "topic spider metrics exist" ); - ok( $topic_spider_metric->{ links_processed } > scalar( @{ $cl } ), "metrics links_processed greater than topic_links" ); -} - -# test that no errors exist in the topics or snapshots tables -sub test_for_errors($) -{ - my ( $db ) = @_; - - my $error_topics = $db->query( "select * from topics where state = 'error'" )->hashes; - - ok( scalar( @{ $error_topics } ) == 0, "topic errors: " . Dumper( $error_topics ) ); - - my $error_snapshots = $db->query( "select * from snapshots where state = 'error'" )->hashes; - - ok( scalar( @{ $error_snapshots } ) == 0, "snapshot errors: " . 
Dumper( $error_snapshots ) ); -} - -sub test_spider_results($$$) -{ - my ( $db, $topic, $sites ) = @_; - - test_topic_stories( $db, $topic, $sites ); - - test_topic_links( $db, $topic, $sites ); - - test_for_errors( $db ); -} - -sub get_site_structure($) -{ - my ( $sites ) = @_; - - my $meta_sites = []; - for my $site ( @{ $sites } ) - { - my $meta_site = { url => $site->{ url } }; - for my $page ( @{ $site->{ pages } } ) - { - my $meta_page = { url => $page->{ url }, matches_topic => $page->{ matches_topic } }; - map { push( @{ $meta_page->{ links } }, $_->{ url } ) } @{ $page->{ links } }; - - $meta_page->{ content } = $page->{ content } - if ( $page->{ matches_topic } && $page->{ matches_topic } ); - - push( @{ $meta_site->{ pages } }, $meta_page ); - } - - push( @{ $meta_sites }, $meta_site ); - } - - return $meta_sites; -} - -sub test_spider($) -{ - my ( $db ) = @_; - - # we pseudo-randomly generate test data, but we want repeatable tests - srand( 3 ); - - MediaWords::Util::Mail::enable_test_mode(); - - my $sites = get_test_sites(); - - TRACE "SITE STRUCTURE " . Dumper( get_site_structure( $sites ) ); - - add_site_media( $db, $sites ); - - my $hash_servers = start_hash_servers( $sites ); - - test_pages( $sites ); - - my $topic = create_topic( $db, $sites ); - - my $mine_args = { - topics_id => $topic->{ topics_id }, - skip_post_processing => 1, # - cache_broken_downloads => 0, # - import_only => 0, # - skip_outgoing_foreign_rss_links => 0, # - test_mode => 1 - }; - - MediaWords::TM::Mine::mine_topic( $db, $topic, $mine_args ); - - test_spider_results( $db, $topic, $sites ); - - map { $_->stop } @{ $hash_servers }; -} - -sub main -{ - my $db = MediaWords::DB::connect_to_db(); - - test_spider( $db ); - - done_testing(); -} - -main(); diff --git a/apps/topics-mine/tests/python/test_add_new_links.py b/apps/topics-mine/tests/python/test_add_new_links.py new file mode 100644 index 0000000000..41c036290a --- /dev/null +++ b/apps/topics-mine/tests/python/test_add_new_links.py @@ -0,0 +1,34 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + num_urls = 100 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_urls); + + # add a bunch of urls with bad urls. 
the fetch-link job will fail with a python error + # but that's fine becase all we are testing here is that each url makes it into the job pool + db.query("delete from topic_links") + links = db.query( + """ + insert into topic_links (topics_id, stories_id, url) + select topics_id, stories_id, 'U ' || stories_id::text from topic_stories + returning * + """).hashes() + + topics_mine.mine.ADD_NEW_LINKS_CHUNK_SIZE = int(num_urls / 2) - 1 + + topics_mine.mine.add_new_links(db, topic, 1, links, None) + + count_processed_tfus = db.query("select count(*) from topic_fetch_urls where state = 'request failed'").flat()[0] + assert count_processed_tfus == num_urls + + count_spidered_links = db.query("select count(*) from topic_links where link_spidered").flat()[0] + assert count_spidered_links == num_urls diff --git a/apps/topics-mine/tests/python/test_check_error_rate.py b/apps/topics-mine/tests/python/test_check_error_rate.py new file mode 100644 index 0000000000..c8cbc4fe94 --- /dev/null +++ b/apps/topics-mine/tests/python/test_check_error_rate.py @@ -0,0 +1,60 @@ +import unittest + +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +from topics_mine.mine import check_job_error_rate, McTopicMineError + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +class TestCheckJobErrorRate(unittest.TestCase): + + def test_check_error_Rate(self): + db = mediawords.db.connect_to_db() + + topic = create_test_topic(db, 'foo') + + # first call should not raise an error because there are not topic_fetch_urls + check_job_error_rate(db, topic) + + num_tfus = 100 + + for i in range(num_tfus): + tfu = { + 'topics_id': topic['topics_id'], + 'url': str(i), + 'state': 'pending' + } + db.create('topic_fetch_urls', tfu) + + # still should not return an error with all pending tfus + check_job_error_rate(db, topic) + + db.query("update topic_fetch_urls set state = 'python error' where url = '1'") + + # only one error, so still no exception + check_job_error_rate(db, topic) + + db.query("update topic_fetch_urls set state = 'python error'") + + # now with all errors we should get an exception + self.assertRaises(McTopicMineError, check_job_error_rate, db, topic) + + db.query("update topic_fetch_urls set state = 'pending'") + + num_stories = 100 + + create_test_topic_stories(db, topic, num_stories) + + # should not return an error with no errors in topic_stories + check_job_error_rate(db, topic) + + db.query("update topic_stories set link_mine_error = 'test error' where stories_id = 1") + + # still should not throw an exception with only one error + check_job_error_rate(db, topic) + + db.query("update topic_stories set link_mine_error = 'test error'") + + # now throw an exception since there are too many errors + self.assertRaises(McTopicMineError, check_job_error_rate, db, topic) diff --git a/apps/topics-mine/tests/python/test_fetch_links.py b/apps/topics-mine/tests/python/test_fetch_links.py new file mode 100644 index 0000000000..e87dfb8aaa --- /dev/null +++ b/apps/topics-mine/tests/python/test_fetch_links.py @@ -0,0 +1,23 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +from topics_mine.mine import fetch_links + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + topic = create_test_topic(db, 'foo') + + num_urls = 100 + + # add a bunch of urls with bad urls. 
the fetch-link job will fail with a python error + # but that's fine becase all we are testing here is that each url makes it into the job pool + + links = [{'url': f"INVALID URL {i}"} for i in range(num_urls)] + + fetch_links(db, topic, links) + + count_processed_tfus = db.query("select count(*) from topic_fetch_urls where state = 'request failed'").flat()[0] + assert count_processed_tfus == num_urls diff --git a/apps/topics-mine/tests/python/test_fetch_social_media_data.py b/apps/topics-mine/tests/python/test_fetch_social_media_data.py new file mode 100644 index 0000000000..cec7c7f437 --- /dev/null +++ b/apps/topics-mine/tests/python/test_fetch_social_media_data.py @@ -0,0 +1,25 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +from topics_mine.mine import fetch_social_media_data + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_social_media_data(): + db = mediawords.db.connect_to_db() + + num_stories = 20 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_stories) + + db.query("update stories set url = stories_id::text") + + fetch_social_media_data(db, topic) + + num_fetched_stories = db.query( + "select count(*) from story_statistics where facebook_api_error like '%URL is not HTTP%'").flat()[0] + + log.warning(db.query("select facebook_api_error from story_statistics").flat()) + + assert num_fetched_stories == num_stories diff --git a/apps/topics-mine/tests/python/test_fetch_twitter_urls.py b/apps/topics-mine/tests/python/test_fetch_twitter_urls.py new file mode 100644 index 0000000000..888cd12836 --- /dev/null +++ b/apps/topics-mine/tests/python/test_fetch_twitter_urls.py @@ -0,0 +1,42 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +from topics_mine.mine import _fetch_twitter_urls + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_twitter_urls(): + db = mediawords.db.connect_to_db() + + topic = create_test_topic(db, 'foo') + + num_urls = 100 + + # add a bunch of urls with non-twitter urls. 
the fetch-twitter-urls job will fail with a python error + # when the urls cannot be parsed for twitter statuses, but that's fine becase all we are testing here + # is that each url makes it into the fetch_twitter_url job pool + + tfus = [] + for i in range(num_urls): + tfu = { + 'topics_id': topic['topics_id'], + 'url': 'http://not.a.twitter.url', + 'state': 'tweet pending' + } + tfu = db.create("topic_fetch_urls", tfu) + + tfus.append(tfu) + + tfu_ids = [tfu['topic_fetch_urls_id'] for tfu in tfus] + + _fetch_twitter_urls(db, topic, tfu_ids) + + # if every url passed to the queue gets tagged with a url error, that means they all got processed + # by the fetch-twitter-urls pool + count_processed_tfus = db.query( + """ + select count(*) from topic_fetch_urls + where state = 'python error' and message like '%McFetchTwitterUrlsDataException%' + """).flat()[0] + + assert count_processed_tfus == num_urls diff --git a/apps/topics-mine/tests/python/test_generate_topic_links.py b/apps/topics-mine/tests/python/test_generate_topic_links.py new file mode 100644 index 0000000000..06a7f11975 --- /dev/null +++ b/apps/topics-mine/tests/python/test_generate_topic_links.py @@ -0,0 +1,29 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +from topics_mine.mine import generate_topic_links + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_generate_topic_links(): + db = mediawords.db.connect_to_db() + + num_stories = 100 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_stories) + + stories = db.query("select * from stories").hashes() + + num_topic_stories = db.query("select count(*) from topic_stories").flat()[0] + assert num_topic_stories == num_stories + + db.query("update stories set description = 'http://foo.com/' || stories_id::text") + + generate_topic_links(db, topic, stories) + + num_unmined_stories = db.query("select count(*) from topic_stories where not link_mined").flat()[0] + assert num_unmined_stories == 0 + + num_mined_links = db.query("select count(*) from topic_links").flat()[0] + assert num_mined_links == num_stories diff --git a/apps/topics-mine/tests/python/test_import_month_with_respider_date.py b/apps/topics-mine/tests/python/test_import_month_with_respider_date.py new file mode 100644 index 0000000000..2a82fcafa5 --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_month_with_respider_date.py @@ -0,0 +1,38 @@ +from topics_mine.mine import _import_month_within_respider_date + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_import_month_with_respider_date(): + topic = { + 'start_date': '2019-01-01', + 'end_date': '2019-06-01', + 'respider_stories': 'f', + 'respider_start_date': None, + 'respider_end_date': None} + + # if none of the respider setting are correct, we should always return true + assert _import_month_within_respider_date(topic, 0) + assert _import_month_within_respider_date(topic, 1) + assert _import_month_within_respider_date(topic, 100) + + # if respider_stories is true but neither respider date is set, always return true + topic['respider_stories'] = 1 + assert _import_month_within_respider_date(topic, 0) + assert _import_month_within_respider_date(topic, 1) + assert _import_month_within_respider_date(topic, 100) + + # should only import the dates after the respider end date + topic['respider_end_date'] = '2019-05-01' + assert not _import_month_within_respider_date(topic, 0) + assert not 
_import_month_within_respider_date(topic, 3) + assert _import_month_within_respider_date(topic, 4) + + # make sure we capture the whole previous month if the end date is within a month + topic['respider_end_date'] = '2019-04-02' + assert _import_month_within_respider_date(topic, 3) + + # should only import the dates before the repsider start date + topic['respider_start_date'] = '2019-02-01' + assert _import_month_within_respider_date(topic, 0) + assert not _import_month_within_respider_date(topic, 1) diff --git a/apps/topics-mine/tests/python/test_import_seed_urls.py b/apps/topics-mine/tests/python/test_import_seed_urls.py new file mode 100644 index 0000000000..a62d74c100 --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_seed_urls.py @@ -0,0 +1,29 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + num_urls = 100 + + topic = create_test_topic(db, 'foo') + + for i in range(num_urls): + tsu = { + 'topics_id': topic['topics_id'], + 'processed': 'false', + 'url': f'INVALID URL {i}'} + db.create('topic_seed_urls', tsu) + + topics_mine.mine.ADD_NEW_LINKS_CHUNK_SIZE = int(num_urls / 2) - 1 + topics_mine.mine.import_seed_urls(db, topic, None) + + count_processed_tfus = db.query("select count(*) from topic_fetch_urls where state = 'request failed'").flat()[0] + assert count_processed_tfus == num_urls + + count_processed_urls = db.query("select count(*) from topic_seed_urls where processed").flat()[0] + assert count_processed_urls == num_urls diff --git a/apps/topics-mine/tests/python/test_import_solr_seed_query.py b/apps/topics-mine/tests/python/test_import_solr_seed_query.py new file mode 100644 index 0000000000..dd0e5a7f2e --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_solr_seed_query.py @@ -0,0 +1,29 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +from mediawords.test.solr import create_test_story_stack_for_indexing, setup_test_index +import topics_mine.mine +import topics_mine.test + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_import_solr_seed_query(): + db = mediawords.db.connect_to_db() + num_stories = 200 + + topic = topics_mine.test.create_topic_for_import(db=db, num_stories=num_stories) + + topics_mine.mine.import_solr_seed_query(db, topic) + + date_stories = db.query( + "select * from stories where publish_date <= %(a)s", + {'a': topic['end_date']}).hashes() + + date_stories_urls = [s['url'] for s in date_stories] + + count_topic_seed_urls = db.query( + "select count(distinct url) from topic_seed_urls where url = any(%(a)s)", + {'a': date_stories_urls}).flat()[0] + + assert len(date_stories) > 0, f"offset {i}" + assert len(date_stories) == count_topic_seed_urls, f"topic seed urls for month offset {i}" diff --git a/apps/topics-mine/tests/python/test_import_solr_seed_query_month.py b/apps/topics-mine/tests/python/test_import_solr_seed_query_month.py new file mode 100644 index 0000000000..bd88388022 --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_solr_seed_query_month.py @@ -0,0 +1,40 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic +from mediawords.test.solr import create_test_story_stack_for_indexing, setup_test_index +import topics_mine.mine +import topics_mine.test + +from mediawords.util.log import create_logger +log = 
create_logger(__name__) + +def test_import_solr_seed_query_month(): + db = mediawords.db.connect_to_db() + num_stories = 200 + + topic = topics_mine.test.create_topic_for_import(db=db, num_stories=num_stories) + + i = 0 + while topics_mine.mine.import_solr_seed_query_month(db, topic, i): + date_stories = db.query( + """ + select * from stories + where + publish_date >= %(a)s::timestamp + ((%(b)s || ' months')::interval) and + publish_date <= %(a)s::timestamp + ((%(c)s || ' months')::interval) and + publish_date <= %(d)s + """, + {'a': topic['start_date'], 'b': i, 'c': i + 1, 'd': topic['end_date']}).hashes() + + date_stories_urls = [s['url'] for s in date_stories] + + count_topic_seed_urls = db.query( + "select count(distinct url) from topic_seed_urls where url = any(%(a)s)", + {'a': date_stories_urls}).flat()[0] + + assert len(date_stories) > 0, f"offset {i}" + assert len(date_stories) == count_topic_seed_urls, f"topic seed urls for month offset {i}" + + i += 1 + + + diff --git a/apps/topics-mine/tests/python/test_import_urls_from_seed_queries.py b/apps/topics-mine/tests/python/test_import_urls_from_seed_queries.py new file mode 100644 index 0000000000..694a11c4a5 --- /dev/null +++ b/apps/topics-mine/tests/python/test_import_urls_from_seed_queries.py @@ -0,0 +1,43 @@ +import csv +import io + +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_import_urls_from_seed_queries(): + db = mediawords.db.connect_to_db() + + num_stories = 100 + + topic = create_test_topic(db, 'foo') + topic['pattern'] = '.*' + topic = db.update_by_id('topics', topic['topics_id'], topic) + + date = topic['start_date'] + + posts = [{'author': i, 'publish_date': date, 'content': f'http://u.u/{i}'} for i in range(num_stories)] + + csv_io = io.StringIO() + csv_writer = csv.DictWriter(csv_io, fieldnames=posts[0].keys()) + csv_writer.writeheader() + [csv_writer.writerow(p) for p in posts] + + seed_csv = csv_io.getvalue() + + tsq = { + 'topics_id': topic['topics_id'], + 'source': 'csv', + 'platform': 'generic_post', + 'query': seed_csv + } + tsq = db.create('topic_seed_queries', tsq) + + topics_mine.mine.import_urls_from_seed_queries(db, topic, None) + + num_tsus = db.query("select count(distinct url) from topic_seed_urls").flat()[0] + + assert num_tsus == num_stories diff --git a/apps/topics-mine/tests/python/test_mine.py b/apps/topics-mine/tests/python/test_mine.py new file mode 100644 index 0000000000..280e21e04e --- /dev/null +++ b/apps/topics-mine/tests/python/test_mine.py @@ -0,0 +1,407 @@ +import random +import socket +import time + +import lorem + +import mediawords.db +import mediawords.test.hash_server +import mediawords.util.sql +from mediawords.util.web.user_agent import UserAgent +from mediawords.util.web.user_agent.request.request import Request + +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +BASE_PORT = 8890 + +NUM_SITES = 5 +NUM_PAGES_PER_SITE = 10 +NUM_LINKS_PER_PAGE = 2 + +TOPIC_PATTERN = 'FOOBARBAZ' + +def get_html_link(page): + return page['url'] + +def lorem_sentences(n: int) -> str: + return ' '.join([lorem.sentence() for i in range(n)]) + +def generate_content_for_site(site): + body = lorem_sentences(5) + + return f""" + + + site['title'] + + +

+            <p>
+                {body}
+            </p>

+ + + """ + +def randindex(n): + """generate a random int >= 0 and < n.""" + return random.randint(0, n - 1) + +def generate_content_for_page(site, page): + num_links = len(page['links']) + num_paragraphs = int(randindex(10) + 3) + num_links + + paragraphs = [] + + for i in range(num_paragraphs): + text = lorem_sentences(5) + if i < num_links: + html_link = get_html_link(page['links'][i]) + text += f" {html_link}" + + paragraphs.append(text) + + if randindex(2) < 1: + paragraphs.append(lorem.sentence() + f" {TOPIC_PATTERN}") + page['matches_topic'] = 1 + + dead_link_text = lorem_sentences(5) + dead_link_text += f" dead link" + + paragraphs.append(dead_link_text) + + body = "\n\n".join([f"

<p>\n{p}\n</p>

" for p in paragraphs]) + + return f""" + + + {page['title']} + + + {body} + + + """ + +def generate_content_for_sites(sites): + for site in sites: + site['content'] = generate_content_for_site(site) + + for p in site['pages']: + p['content'] = generate_content_for_page(site, p) + +def get_test_sites(): + """ generate test set of sites""" + sites = [] + pages = [] + + # base_port = BASE_PORT + int(rand( 200) ) + base_port = BASE_PORT + + for site_id in range(NUM_SITES): + port = base_port + site_id + # other containers will access this host to we have to set the actual hostname instead of just localhost + host = socket.gethostname() + + site = { + 'port': port, + 'id': site_id, + 'url': f"http://{host}:{port}/", + 'title': f"site {site_id}", + 'pages': [] + } + + num_pages = int(randindex(NUM_PAGES_PER_SITE)) + 1 + for page_id in range(num_pages): + date = mediawords.util.sql.get_sql_date_from_epoch(time.time() - (randindex(365) * 86400)) + + path = f"page-{page_id}" + + page = { + 'id': page_id, + 'path': f"/{path}", + 'url': f"{site['url']}{path}", + 'title': f"page {page_id}", + 'pubish_date': date, + 'links': [], + 'matches_topic': False + } + + pages.append(page) + site['pages'].append(page) + + sites.append(site) + + for page in pages: + num_links = int(randindex(NUM_LINKS_PER_PAGE)) + for link_id in range(num_links): + linked_page_id = int(randindex(len(pages))) + linked_page = pages[linked_page_id] + + if not mediawords.util.url.urls_are_equal(page['url'], linked_page['url']): + page['links'].append(linked_page) + + generate_content_for_sites(sites) + + return sites + +def add_site_media(db, sites): + """add a medium for each site so that the spider can find the medium that corresponds to each url""" + for s in sites: + s['medium'] = db.create('media', {'url': s['url'], 'name': s['title']}) + +def start_hash_servers(sites): + hash_servers = [] + + for site in sites: + site_hash = {} + site_hash['/'] = site['content'] + + for p in site['pages']: + site_hash[p['path']] = p['content'] + + hs = mediawords.test.hash_server.HashServer(port=site['port'], pages=site_hash) + + log.debug(f"starting hash server {site['id']}") + + hs.start() + + hash_servers.append(hs) + + # wait for the hash servers to start + time.sleep(1) + + return hash_servers + +def validate_page(label, url, expected_content): + + log.debug(f"test page: {label} {url}") + + ua = UserAgent() + request = Request('get', url) + response = ua.request(request) + + assert response.is_success(), f"request success: {label} {url}" + + got_content = response.decoded_content() + + log.debug("got content") + + assert got_content == expected_content + +def validate_pages(sites): + for site in sites: + log.debug(f"testing pages for site {site['id']}") + validate_page(f"site {site['id']}", site['url'], site['content']) + + [validate_page(f"page {site['id']} p{['id']}", p['url'], p['content']) for p in site['pages']] + +def seed_unlinked_urls(db, topic, sites): + all_pages = [] + [all_pages.extend(s['pages']) for s in sites] + + # do not seed urls that are linked directly from a page that is a topic match. + # this forces the test to succesfully discover those pages through spidering. 
+ non_seeded_url_lookup = {} + for page in all_pages: + if page['matches_topic']: + for l in page['links']: + non_seeded_url_lookup[l['url']] = 1 + + seed_pages = [] + for page in all_pages: + if non_seeded_url_lookup.get(page['url'], False): + log.debug(f"non seeded url: {page['url']}") + else: + log.debug(f"seed url: {page['url']}") + seed_pages.append(page) + + [db.create('topic_seed_urls', {'topics_id': topic['topics_id'], 'url': p['url']}) for p in seed_pages] + +def create_topic(db, sites): + now = mediawords.util.sql.sql_now() + start_date = mediawords.util.sql.increment_day(now, -30) + end_date = mediawords.util.sql.increment_day(now, 30) + + topic = { + 'name': 'test topic', + 'description': 'test topic', + 'pattern': TOPIC_PATTERN, + 'solr_seed_query': 'stories_id:0', + 'solr_seed_query_run': 't', + 'start_date': start_date, + 'end_date': end_date, + 'job_queue': 'mc', + 'max_stories': 100_000, + 'platform': 'web' + } + topic = db.create('topics', topic) + + seed_unlinked_urls(db, topic, sites) + + # avoid race condition in TM::Mine + db.create('tag_sets', {'name': 'extractor_version'}) + + return topic + +def validate_topic_stories(db, topic, sites): + topic_stories = db.query( + """ + select cs.*, s.* + from topic_stories cs + join stories s on (s.stories_id = cs.stories_id) + where cs.topics_id = %(a)s + """, + {'a': topic['topics_id']}).hashes() + + all_pages = [] + [all_pages.extend(s['pages']) for s in sites] + + log.info(f"ALL PAGES: {len(all_pages)}") + + topic_pages = [p for p in all_pages if p['matches_topic']] + + log.info(f"TOPIC PAGES: {len(topic_pages)}") + + topic_pages_lookup = {s['url']: s for s in topic_stories} + + log.info(f"TOPIC PAGES LOOKUP: {len(topic_pages_lookup)}") + + for topic_story in topic_stories: + assert topic_pages_lookup.get(topic_story['url'], False) + del topic_pages_lookup[topic_story['url']] + + assert len(topic_pages_lookup) == 0 + + # Wait for pending URLs to disappear + WAIT_PENDING_SECONDS = 10 + pending_count = 0 + pending_retry = 0 + while pending_retry <= WAIT_PENDING_SECONDS: + pending_count = db.query("select count(*) from topic_fetch_urls where state ='pending'").flat()[0] + if pending_count > 0: + log.warning("Still pending_count URLs are pending, will retry shortly") + time.sleep(1) + else: + log.info("No more pending URLs, continuing") + break + + pending_retry += 1 + + assert pending_count == 0, f"After waiting {WAIT_PENDING_SECONDS} some URLs are still in 'pending' state" + + dead_link_count = db.query( "select count(*) from topic_fetch_urls where state ='request failed'").flat()[0] + dead_pages_count = db.query("select count(*) from topic_fetch_urls where url like '%dead%'").flat()[0] + + if dead_link_count != dead_pages_count: + fetch_states = db.query("select count(*), state from topic_fetch_urls group by state" ).hashes() + log.info(f"fetch states: {fetch_states}") + + fetch_errors = db.query("select * from topic_fetch_urls where state = 'python error'").hashes() + log.info(f"fetch errors: {fetch_errors}") + + assert dead_link_count == dead_pages_count, "dead link count" + +def validate_topic_links(db, topic, sites): + cid = topic['topics_id'] + + topic_links = db.query("select * from topic_links").hashes() + + log.info(f"TOPIC LINKS: {len(topic_links)}") + + all_pages = [] + [all_pages.extend(s['pages']) for s in sites] + + for page in all_pages: + if not page['matches_topic']: + continue + + for link in page['links']: + if not link['matches_topic']: + continue + + topic_links = db.query( + """ + select * + from 
topic_links cl + join stories s on (cl.stories_id = s.stories_id) + where + s.url = %(a)s and + cl.url = %(b)s and + cl.topics_id = %(c)s + """, + {'a': page['url'], 'b': link['url'], 'c': cid}).hashes() + + assert len(topic_links) == 1, f"number of topic_links for {page['url']} -> {link['url']}" + + topic_spider_metric = db.query( + "select sum(links_processed) links_processed from topic_spider_metrics where topics_id = %(a)s", + {'a': cid}).hash() + + assert topic_spider_metric,"topic spider metrics exist" + assert topic_spider_metric['links_processed'] > len(topic_links), "metrics links_processed greater than topic_links" + +def validate_for_errors(db): + """ test that no errors exist in the topics or snapshots tables""" + error_topics = db.query("select * from topics where state = 'error'").hashes() + + assert len( error_topics) == 0, f"topic errors: {error_topics}" + + error_snapshots = db.query("select * from snapshots where state = 'error'").hashes() + + assert len( error_snapshots) == 0, f"snapshot errors:{error_snapshots}" + +def validate_spider_results(db, topic, sites): + validate_topic_stories(db, topic, sites) + validate_topic_links(db, topic, sites) + validate_for_errors(db) + +def get_site_structure(sites): + meta_sites = [] + for site in sites: + meta_site = {'url': site['url'], 'pages': []} + for page in site['pages']: + meta_page = {'url': page['url'], 'matches_topic': page['matches_topic'], 'links': []} + [meta_page['links'].append(l['url']) for l in page['links']] + + if page['matches_topic'] and meta_page['matches_topic']: + meta_page['content'] = page['content'] + + meta_site['pages'].append(meta_page) + + meta_sites.append(meta_site) + + return meta_sites + +def test_mine(): + # we pseudo-randomly generate test data, but we want repeatable tests + random.seed(3) + + db = mediawords.db.connect_to_db() + + mediawords.util.mail.enable_test_mode() + + sites = get_test_sites() + + log.debug(f"SITE STRUCTURE {get_site_structure(sites)}") + + add_site_media(db, sites) + + hash_servers = start_hash_servers(sites) + + validate_pages(sites) + + topic = create_topic(db, sites) + + topics_mine.mine.DOMAIN_TIMEOUT = 0 + + topics_mine.mine.mine_topic( + db=db, + topic=topic, + skip_post_processing=True) + + validate_spider_results(db, topic, sites) + + [hs.stop for hs in hash_servers] diff --git a/apps/topics-mine/tests/python/test_mine_topic_stories.py b/apps/topics-mine/tests/python/test_mine_topic_stories.py new file mode 100644 index 0000000000..76a686ac5f --- /dev/null +++ b/apps/topics-mine/tests/python/test_mine_topic_stories.py @@ -0,0 +1,21 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + num_urls = 100 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_urls); + + topics_mine.mine.EXTRACT_STORY_LINKS_CHUNK_SIZE = int(num_urls / 2) - 1 + + topics_mine.mine.mine_topic_stories(db, topic) + + count_spidered_stories = db.query("select count(*) from topic_stories where link_mined").flat()[0] + assert count_spidered_stories == num_urls diff --git a/apps/topics-mine/tests/python/test_respider.py b/apps/topics-mine/tests/python/test_respider.py new file mode 100644 index 0000000000..f58725a71a --- /dev/null +++ b/apps/topics-mine/tests/python/test_respider.py @@ -0,0 +1,105 @@ +import mediawords.db +from 
mediawords.test.db.create import create_test_topic, create_test_topic_stories +import mediawords.util.sql +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_social_media_data(): + db = mediawords.db.connect_to_db() + + topic = create_test_topic(db, 'foo') + + topic['start_date'] = '2017-01-01' + topic['end_date'] = '2018-01-01' + + topic = db.update_by_id( + 'topics', + topic['topics_id'], + { 'max_stories': 0, 'start_date': '2017-01-01', 'end_date': '2018-01-01' } + ) + + num_stories = 101 + create_test_topic_stories(db, topic, 1, num_stories) + + # no respidering without respider_stories + db.query("update topic_stories set link_mined = 't'") + + topics_mine.mine.set_stories_respidering(db, topic, None) + + got_num_respider_stories = db.query( "select count(*) from topic_stories where not link_mined" ).flat()[0] + assert got_num_respider_stories == 0 + + # respider everything with respider_stories but no dates + topic['respider_stories'] = 1 + + db.query("update topic_stories set link_mined = 't'") + + topics_mine.mine.set_stories_respidering(db, topic, None) + + got_num_respider_stories = db.query( "select count(*) from topic_stories where not link_mined" ).flat()[0] + assert got_num_respider_stories == num_stories + + # respider stories within the range of changed dates + topic_update = { + 'respider_stories': 't', + 'respider_end_date': topic['end_date'], + 'respider_start_date': topic['start_date'], + 'end_date': '2019-01-01', + 'start_date': '2016-01-01' + } + + topic = db.update_by_id('topics', topic['topics_id'], topic_update) + + db.query("update topic_stories set link_mined = 't'") + + num_date_changes = 10 + db.query("update stories set publish_date = '2017-06-01'") + db.query( + """ + update stories set publish_date = %(a)s where stories_id in + (select stories_id from stories order by stories_id limit %(b)s) + """, + {'a': '2018-06-01', 'b': num_date_changes}) + db.query( + """ + update stories set publish_date = %(a)s where stories_id in + (select stories_id from stories order by stories_id desc limit %(b)s) + """, + {'a': '2016-06-01', 'b': num_date_changes}) + + snapshot = { + 'topics_id': topic['topics_id'], + 'snapshot_date': mediawords.util.sql.sql_now(), + 'start_date': topic['start_date'], + 'end_date': topic['end_date']} + + snapshot = db.create('snapshots', snapshot) + + timespan_dates = [['2017-01-01', '2017-01-31'], ['2017-12-20', '2018-01-20'], ['2016-12-20', '2017-01-20']] + + for dates in timespan_dates: + (start_date, end_date) = dates + timespan = { + 'snapshots_id': snapshot['snapshots_id'], + 'start_date': start_date, + 'end_date': end_date, + 'period': 'monthly', + 'story_count': 0, + 'story_link_count': 0, + 'medium_count': 0, + 'medium_link_count': 0, + 'post_count': 0} + + timespan = db.create('timespans', timespan) + + topics_mine.mine.set_stories_respidering(db, topic, snapshot['snapshots_id']) + + got_num_respider_stories = db.query("select count(*) from topic_stories where not link_mined").flat()[0] + assert got_num_respider_stories == 2 * num_date_changes + + got_num_archived_timespans = db.query( + "select count(*) from timespans where archive_snapshots_id = %(a)s", + {'a': snapshot['snapshots_id']}).flat()[0] + assert got_num_archived_timespans == 2 diff --git a/apps/topics-mine/tests/python/test_spider_new_links.py b/apps/topics-mine/tests/python/test_spider_new_links.py new file mode 100644 index 0000000000..88657de29b --- /dev/null +++ 
b/apps/topics-mine/tests/python/test_spider_new_links.py @@ -0,0 +1,32 @@ +import mediawords.db +from mediawords.test.db.create import create_test_topic, create_test_topic_stories +import topics_mine.mine + +from mediawords.util.log import create_logger +log = create_logger(__name__) + +def test_fetch_links(): + db = mediawords.db.connect_to_db() + + num_urls = 10 + + topic = create_test_topic(db, 'foo') + create_test_topic_stories(db, topic, 1, num_urls); + + # add a bunch of urls with bad urls. the fetch-link job will fail with a python error + # but that's fine becase all we are testing here is that each url makes it into the job pool + db.query("delete from topic_links") + links = db.query( + """ + insert into topic_links (topics_id, stories_id, url) + select topics_id, stories_id, 'U ' || stories_id::text from topic_stories + returning * + """).hashes() + + topics_mine.mine.spider_new_links(db, topic, 1, None) + + count_processed_tfus = db.query("select count(*) from topic_fetch_urls where state = 'request failed'").flat()[0] + assert count_processed_tfus == num_urls + + count_spidered_links = db.query("select count(*) from topic_links where link_spidered").flat()[0] + assert count_spidered_links == num_urls diff --git a/dev/run_test.py b/dev/run_test.py index 99703d9f10..72b1c92525 100755 --- a/dev/run_test.py +++ b/dev/run_test.py @@ -68,7 +68,7 @@ def docker_test_commands(all_apps_dir: str, test_file: str, verbose: bool) -> Li if test_file.endswith('.py'): test_command = [ - 'py.test', '-s', '-vv', + 'py.test', '-s', '-vv', # Disable cache because it won't be preserved '-p', 'no:cacheprovider',