Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ services:
- mongodb_container
logging:
options:
max-size: 100m
max-size: "100m"
ports:
- 5000
volumes:
Expand All @@ -24,7 +24,7 @@ services:
restart: always
logging:
options:
max-size: 100m ## limit the size of logging files during scraping
max-size: "100m" ## limit the size of logging files during scraping
ports:
- "27000:27017" ## map the port 27000 to mongodb's default port
environment:
Expand All @@ -37,11 +37,17 @@ services:
ports:
- "7480:6379"
# command: rq worker crawling-tasks
logging:
options:
max-size: "100m"
redis_worker:
build: scrapy/schools
depends_on:
- mongodb_container
- redis
logging:
options:
max-size: "100m"
command: rq worker crawling-tasks --path /code/schools/
volumes:
- crawler_api:/code/schools/spiders
Expand Down
302 changes: 151 additions & 151 deletions schools/schools/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,112 +32,112 @@
import requests
import gridfs

class MongoDBImagesPipeline(object):
# class MongoDBImagesPipeline(object):

def __init__(self, MONGO_URI, MONGODB_DB, MONGODB_COLLECTION_IMAGES, MONGO_USERNAME='admin', MONGO_PASSWORD=''):
self.MONGO_URI = MONGO_URI
self.MONGODB_DB = MONGODB_DB
self.MONGODB_COLLECTION_IMAGES = MONGODB_COLLECTION_IMAGES
self.MONGO_USERNAME = MONGO_USERNAME
self.MONGO_PASSWORD = MONGO_PASSWORD
# def __init__(self, MONGO_URI, MONGODB_DB, MONGODB_COLLECTION_IMAGES, MONGO_USERNAME='admin', MONGO_PASSWORD=''):
# self.MONGO_URI = MONGO_URI
# self.MONGODB_DB = MONGODB_DB
# self.MONGODB_COLLECTION_IMAGES = MONGODB_COLLECTION_IMAGES
# self.MONGO_USERNAME = MONGO_USERNAME
# self.MONGO_PASSWORD = MONGO_PASSWORD

@classmethod
def from_crawler(cls, crawler):
# pull in information from settings.py
return cls(
MONGO_URI=crawler.settings.get('MONGO_URI'),
MONGODB_DB=crawler.settings.get('MONGODB_DB'),
MONGODB_COLLECTION_IMAGES=crawler.settings.get('MONGODB_COLLECTION_IMAGES'),
MONGO_USERNAME = crawler.settings.get('MONGO_USERNAME'),
MONGO_PASSWORD = crawler.settings.get('MONGO_PASSWORD')
)
# @classmethod
# def from_crawler(cls, crawler):
# # pull in information from settings.py
# return cls(
# MONGO_URI=crawler.settings.get('MONGO_URI'),
# MONGODB_DB=crawler.settings.get('MONGODB_DB'),
# MONGODB_COLLECTION_IMAGES=crawler.settings.get('MONGODB_COLLECTION_IMAGES'),
# MONGO_USERNAME = crawler.settings.get('MONGO_USERNAME'),
# MONGO_PASSWORD = crawler.settings.get('MONGO_PASSWORD')
# )


def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
connection = pymongo.MongoClient(
self.MONGO_URI,
username=self.MONGO_USERNAME,
password=self.MONGO_PASSWORD
)
self.db = connection[self.MONGODB_DB]
print("CONNECTED TO MONGO DB")
# def process_item(self, item, spider):
# valid = True
# for data in item:
# if not data:
# valid = False
# raise DropItem("Missing {0}!".format(data))
# if valid:
# connection = pymongo.MongoClient(
# self.MONGO_URI,
# username=self.MONGO_USERNAME,
# password=self.MONGO_PASSWORD
# )
# self.db = connection[self.MONGODB_DB]
# print("CONNECTED TO MONGO DB")

#self.collection = self.db[self.MONGODB_COLLECTION_IMAGES]
self.grid_fs = gridfs.GridFS(self.db, collection = self.MONGODB_COLLECTION_IMAGES)
# #self.collection = self.db[self.MONGODB_COLLECTION_IMAGES]
# self.grid_fs = gridfs.GridFS(self.db, collection = self.MONGODB_COLLECTION_IMAGES)

links = item['image_urls']
# links = item['image_urls']

for link in links:
mime_type = mimetypes.guess_type(link)[0]
request = requests.get(link, stream=True)
self.grid_fs.put(request.raw, contentType=mime_type,
user = spider.user if hasattr(spider,"user") else None,
rq_id = spider.rq_id if hasattr(spider,"rq_id") else None,
filename = os.path.basename(link), bucketName = "images")
# for link in links:
# mime_type = mimetypes.guess_type(link)[0]
# request = requests.get(link, stream=True)
# self.grid_fs.put(request.raw, contentType=mime_type,
# user = spider.user if hasattr(spider,"user") else None,
# rq_id = spider.rq_id if hasattr(spider,"rq_id") else None,
# filename = os.path.basename(link), bucketName = "images")

logging.debug(f"MongoDB: Inserted {item['image_urls']}.")
# logging.debug(f"MongoDB: Inserted {item['image_urls']}.")

return item
# return item



class MongoDBFilesPipeline(object):
# class MongoDBFilesPipeline(object):

def __init__(self, MONGO_URI, MONGODB_DB, MONGODB_COLLECTION_FILES, MONGO_USERNAME='admin', MONGO_PASSWORD=''):
self.MONGO_URI = MONGO_URI
self.MONGODB_DB = MONGODB_DB
self.MONGODB_COLLECTION_FILES = MONGODB_COLLECTION_FILES
self.MONGO_USERNAME = MONGO_USERNAME
self.MONGO_PASSWORD = MONGO_PASSWORD
# def __init__(self, MONGO_URI, MONGODB_DB, MONGODB_COLLECTION_FILES, MONGO_USERNAME='admin', MONGO_PASSWORD=''):
# self.MONGO_URI = MONGO_URI
# self.MONGODB_DB = MONGODB_DB
# self.MONGODB_COLLECTION_FILES = MONGODB_COLLECTION_FILES
# self.MONGO_USERNAME = MONGO_USERNAME
# self.MONGO_PASSWORD = MONGO_PASSWORD

@classmethod
def from_crawler(cls, crawler):
# pull in information from settings.py
return cls(
MONGO_URI=crawler.settings.get('MONGO_URI'),
MONGODB_DB=crawler.settings.get('MONGODB_DB'),
MONGODB_COLLECTION_FILES=crawler.settings.get('MONGODB_COLLECTION_FILES'),
MONGO_USERNAME = crawler.settings.get('MONGO_USERNAME'),
MONGO_PASSWORD = crawler.settings.get('MONGO_PASSWORD')
)
# @classmethod
# def from_crawler(cls, crawler):
# # pull in information from settings.py
# return cls(
# MONGO_URI=crawler.settings.get('MONGO_URI'),
# MONGODB_DB=crawler.settings.get('MONGODB_DB'),
# MONGODB_COLLECTION_FILES=crawler.settings.get('MONGODB_COLLECTION_FILES'),
# MONGO_USERNAME = crawler.settings.get('MONGO_USERNAME'),
# MONGO_PASSWORD = crawler.settings.get('MONGO_PASSWORD')
# )


def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if valid:
connection = pymongo.MongoClient(
self.MONGO_URI,
username=self.MONGO_USERNAME,
password=self.MONGO_PASSWORD
)
self.db = connection[self.MONGODB_DB]
print("CONNECTED TO MONGO DB")
#self.collection = self.db[self.MONGODB_COLLECTION_FILES]
# def process_item(self, item, spider):
# valid = True
# for data in item:
# if not data:
# valid = False
# raise DropItem("Missing {0}!".format(data))
# if valid:
# connection = pymongo.MongoClient(
# self.MONGO_URI,
# username=self.MONGO_USERNAME,
# password=self.MONGO_PASSWORD
# )
# self.db = connection[self.MONGODB_DB]
# print("CONNECTED TO MONGO DB")
# #self.collection = self.db[self.MONGODB_COLLECTION_FILES]

self.grid_fs = gridfs.GridFS(self.db, collection = self.MONGODB_COLLECTION_FILES)
# self.grid_fs = gridfs.GridFS(self.db, collection = self.MONGODB_COLLECTION_FILES)

links = item['file_urls']
for link in links:
mime_type = mimetypes.guess_type(link)[0]
request = requests.get(link, stream=True)
self.grid_fs.put(request.raw, contentType=mime_type,
user = spider.user if hasattr(spider,"user") else None,
rq_id = spider.rq_id if hasattr(spider,"rq_id") else None,
filename = os.path.basename(link), bucketName = "files")
# links = item['file_urls']
# for link in links:
# mime_type = mimetypes.guess_type(link)[0]
# request = requests.get(link, stream=True)
# self.grid_fs.put(request.raw, contentType=mime_type,
# user = spider.user if hasattr(spider,"user") else None,
# rq_id = spider.rq_id if hasattr(spider,"rq_id") else None,
# filename = os.path.basename(link), bucketName = "files")

logging.debug(f"MongoDB: Inserted {item['file_urls']}.")
# logging.debug(f"MongoDB: Inserted {item['file_urls']}.")

return item
# return item


class MongoDBTextPipeline(object):
Expand Down Expand Up @@ -193,78 +193,78 @@ def process_item(self, item, spider):

# TODO: add error handling

class MongoDBPipeline(object):
# class MongoDBPipeline(object):

collection_name = 'outputItems'
# collection_name = 'outputItems'

def __init__(self, mongo_uri, mongo_db, mongo_user='admin', mongo_pwd='', mongo_repl = False, mongo_repl_name=''):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
self.mongo_user = mongo_user
self.mongo_password = mongo_pwd
self.mongo_replication = mongo_repl
self.mongo_replica_set_name=mongo_repl_name
# def __init__(self, mongo_uri, mongo_db, mongo_user='admin', mongo_pwd='', mongo_repl = False, mongo_repl_name=''):
# self.mongo_uri = mongo_uri
# self.mongo_db = mongo_db
# self.mongo_user = mongo_user
# self.mongo_password = mongo_pwd
# self.mongo_replication = mongo_repl
# self.mongo_replica_set_name=mongo_repl_name

@classmethod
def from_crawler(cls, crawler):
# pull in information from settings.py
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_db=crawler.settings.get('MONGO_DATABASE', 'items'),
mongo_user=crawler.settings.get('MONGO_USERNAME'),
mongo_pwd=crawler.settings.get('MONGO_PASSWORD'),
mongo_repl=crawler.settings.get('MONGO_REPLICATION'),
mongo_repl_name=crawler.settings.get('MONGO_REPLICA_SET')
)
# @classmethod
# def from_crawler(cls, crawler):
# # pull in information from settings.py
# return cls(
# mongo_uri=crawler.settings.get('MONGO_URI'),
# mongo_db=crawler.settings.get('MONGO_DATABASE', 'items'),
# mongo_user=crawler.settings.get('MONGO_USERNAME'),
# mongo_pwd=crawler.settings.get('MONGO_PASSWORD'),
# mongo_repl=crawler.settings.get('MONGO_REPLICATION'),
# mongo_repl_name=crawler.settings.get('MONGO_REPLICA_SET')
# )

def open_spider(self, spider):
# initializing spider
# opening db connection
print("MONGO URI: " + str(self.mongo_uri))
if self.mongo_replication:
self.client = pymongo.MongoClient(self.mongo_uri, replicaSet = self.mongo_replica_set_name, username = self.mongo_user, password = self.mongo_password)
else:
self.client = pymongo.MongoClient(self.mongo_uri, username=self.mongo_user, password=self.mongo_password)
print("MONGO CLIENT SET UP SUCCESSFULLY")
print("Self MONGO DB: " + str(self.mongo_db))
self.db = self.client[self.mongo_db]
print("CONNECTED TO MONGO DB")
# def open_spider(self, spider):
# # initializing spider
# # opening db connection
# print("MONGO URI: " + str(self.mongo_uri))
# if self.mongo_replication:
# self.client = pymongo.MongoClient(self.mongo_uri, replicaSet = self.mongo_replica_set_name, username = self.mongo_user, password = self.mongo_password)
# else:
# self.client = pymongo.MongoClient(self.mongo_uri, username=self.mongo_user, password=self.mongo_password)
# print("MONGO CLIENT SET UP SUCCESSFULLY")
# print("Self MONGO DB: " + str(self.mongo_db))
# self.db = self.client[self.mongo_db]
# print("CONNECTED TO MONGO DB")

def close_spider(self, spider):
# clean up when spider is closed
self.client.close()
print("Mongo Client closed")
# def close_spider(self, spider):
# # clean up when spider is closed
# self.client.close()
# print("Mongo Client closed")

def process_item(self, item, spider):
"""
For each CharterItem item, insert the item into the specified
collection of the MongoDB database. If the item
already exists, replace it (this prevents duplicates).
# def process_item(self, item, spider):
# """
# For each CharterItem item, insert the item into the specified
# collection of the MongoDB database. If the item
# already exists, replace it (this prevents duplicates).

To check if an item already exists, filter by the item's
url field.
"""
print("Processing item...")
# Only store CharterItems.
# To check if an item already exists, filter by the item's
# url field.
# """
# print("Processing item...")
# # Only store CharterItems.

adapted_item = ItemAdapter(item).asdict()
adapted_item.update({
"user": spider.user if hasattr(spider,"user") else None,
"rq_id": spider.rq_id if hasattr(spider,"rq_id") else None
})
# adapted_item = ItemAdapter(item).asdict()
# adapted_item.update({
# "user": spider.user if hasattr(spider,"user") else None,
# "rq_id": spider.rq_id if hasattr(spider,"rq_id") else None
# })

if not isinstance(item, CharterItem):
print("Not an instance of CharterItem")
print(item['url'])
self.db['otherItems'].replace_one({'url': item['url']}, adapted_item, upsert=True)
return item
# Finds the document with the matching url.
query = {'url': item['url']}
# upsert=True means insert the document if the query doesn't find a match.
self.db[self.collection_name].replace_one(
query, adapted_item, upsert=True
)
# self.db[self.collection_name].insert(dict(item))
logging.debug(f"MongoDB: Inserted {item['url']}.")
return item
# if not isinstance(item, CharterItem):
# print("Not an instance of CharterItem")
# print(item['url'])
# self.db['otherItems'].replace_one({'url': item['url']}, adapted_item, upsert=True)
# return item
# # Finds the document with the matching url.
# query = {'url': item['url']}
# # upsert=True means insert the document if the query doesn't find a match.
# self.db[self.collection_name].replace_one(
# query, adapted_item, upsert=True
# )
# # self.db[self.collection_name].insert(dict(item))
# logging.debug(f"MongoDB: Inserted {item['url']}.")
# return item

4 changes: 2 additions & 2 deletions schools/schools/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@
# Item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'schools.pipelines.MongoDBImagesPipeline': 3,
'schools.pipelines.MongoDBFilesPipeline': 4,
# 'schools.pipelines.MongoDBImagesPipeline': 3,
# 'schools.pipelines.MongoDBFilesPipeline': 4,
'schools.pipelines.MongoDBTextPipeline': 300
}

Expand Down
Loading