diff --git a/CHANGELOG b/CHANGELOG
index 80a8de0..9b7b508 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,7 +1,8 @@
 1.1.2
 -----
-
-
+* Add support for schema overrides on manual import. (#946)
+* Fix openpyxl dependency bug. (#941)
+* Fix db column-size bug for large uploads. (#945)
 
 1.1.1
 -----
diff --git a/docs/manual_imports.rst b/docs/manual_imports.rst
index 6d880e6..d3639e4 100644
--- a/docs/manual_imports.rst
+++ b/docs/manual_imports.rst
@@ -5,21 +5,21 @@ Manually importing large datasets
 When you need to manual import
 ==============================
 
-The PANDA web interface may fail when you attempt to upload very large datasets. The exact size at which the uploads will fail depends on the specifics of your server (RAM size, in particular), but anything larger than 100MB may be a problem.
+The PANDA web interface may fail when you attempt to upload very large datasets. The exact size at which the uploads will fail depends on the specifics of your server (RAM size, in particular), but anything larger than 100MB may be a problem. PANDA may also experience issues when re-indexing very large datasets for the purpose of enabling field-level search.
 
-If you experience problems uploading large files, this document describes an alternative way of uploading them that bypasses the web interface. This method is much less convenient, but should be accessible for intermediate to advanced PANDA operators.
+If you experience either of these problems, this document describes an alternative way of uploading data that bypasses the web interface. This method is much less convenient, but should be accessible for intermediate to advanced PANDA operators.
 
 Uploading a file to your server
--------------------------------
+===============================
 
 Manually importing files is a two-step process. First you must upload them to your server, then you can execute the import process.
 
-Uploading files your server requires using a command-line program called ``scp``. This program allows you to send a file to your server over :doc:`SSH `. It may help to quickly review the :doc:`SSH ` documentation now. If you are on Mac/Linux, `scp` comes preinstalled. On Windows it comes as part of `Putty `_. In either case, the command to upload your file will look like:
+Uploading files to your server requires using a command-line program called ``scp``. This program allows you to send a file to your server over :doc:`SSH `. It may help to quickly review the :doc:`SSH ` documentation now. If you are on Mac/Linux, `scp` comes preinstalled. On Windows it comes as part of `Putty `_. In either case, the command to upload your file will look like:
 
 ``scp -i /path/to/my/ec2_key.pem /path/to/my/dataset.csv ubuntu@my_server_domain_name.com:/tmp/``
 
 Executing the manual import
----------------------------
+===========================
 
 Once your file has finished copying to your PANDA server, you will need to SSH in to execute the manual import process. Refer to the :doc:`SSH ` documentation for instructions on how to SSH in. Once you're at the command line on your server, execute the following commands to import your file:
@@ -37,3 +37,64 @@ Once your file has finished copying to your PANDA server, you will need to SSH i
 In the example ``dataset.csv`` is the name of the file you uploaded (not including the path) and ``user@email.com`` is the login of the user you want the to "own" the dataset. Once this script returns your file will be importing via the normal process and you can review it's progress via the web interface.
 
 The dataset name and description will be set to the system defaults and should be updated in the web interface. From this point forward the dataset should be indistinguishable from one uploaded via the normal process.
+
+
+Enabling field search during bulk load
+=======================================
+
+PANDA may have trouble re-indexing "large" datasets, typically those with millions of rows or more. Re-indexing is performed when you add field-level search to a dataset after its initial import.
+If you have trouble re-indexing a large dataset, you can supply the bulk import command with a schema override file that enables field-level search during the initial import.
+
+.. code-block:: bash
+
+    sudo mv /tmp/dataset.csv /var/lib/panda/uploads/
+    sudo chown panda:panda /var/lib/panda/uploads/dataset.csv
+    cd /opt/panda
+    sudo -u panda -E python manage.py manual_import dataset.csv user@email.com -o /path/to/schema_overrides.csv
+
+
+Schema override file format
+----------------------------
+
+The schema override file provides the ability to enable field-level search and customize the data types for any combination of fields. The override file should be a simple comma-separated CSV with two columns:
+
+- **field name** (required) must precisely match the corresponding field name in the source data file (note, the match is case sensitive!)
+- **data type** (optional) must be one of the valid PANDA data types listed below; if left blank, PANDA's guessed type is used:
+
+  - unicode
+  - int
+  - float
+  - bool
+  - datetime
+  - date
+  - time
+
+When defining a schema override file, it's a good idea to test a smaller sample of the data to ensure you have the correct column names and data types.
+PANDA will often guess the right data type for a column based on a sampling of data. However, this may not always work as expected,
+such as with a salary field prefixed with a dollar sign (PANDA will treat this as a string rather than interpreting it as a float).
+
+Experimenting with a subset of data will help identify such issues and suggest potential pre-processing steps that might be necessary prior
+to final import (e.g. stripping a leading dollar sign from a currency field).
+
+Once you've ironed out such kinks on the smaller data slice, you can apply the schema overrides to the full data set.
+
+Below is a sample data set and schema override file.
+
+.. code-block:: bash
+
+    # my_sample_data.csv
+    name,birthdate,salary,zip
+    John,1990-01-01,55000,20007
+    Jane,1989-01-01,65000,20007
+
+The related schema override file (below) would add indexes on *birthdate*, *salary* and *zip*.
+
+.. code-block:: bash
+
+    # schema_overrides.csv
+    birthdate,
+    salary,
+    zip,unicode
+
+In this example, PANDA correctly guesses the data types for *birthdate* and *salary*, so we can leave the data type column blank for those fields.
+However, we explicitly specify *unicode* for the zip code to ensure it is treated as a string rather than an integer.
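+
+For reference, the override file above is parsed into a per-field options dictionary before the import runs. The sketch below shows the structure it maps to (based on the ``manual_import`` command's loader, which sets only the ``indexed`` and ``type`` keys):
+
+.. code-block:: python
+
+    {
+        'birthdate': {'indexed': True},
+        'salary': {'indexed': True},
+        'zip': {'indexed': True, 'type': 'unicode'},
+    }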
diff --git a/panda/management/commands/manual_import.py b/panda/management/commands/manual_import.py
index 44e2063..45a2c92 100644
--- a/panda/management/commands/manual_import.py
+++ b/panda/management/commands/manual_import.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 
+import csv
 import os
 
 from django.conf import settings
@@ -7,12 +8,22 @@ from django.utils.translation import ugettext as _
 from livesettings import config_value
 
+from optparse import make_option
 from panda.models import Dataset, DataUpload, UserProxy
+from panda.utils.typecoercion import TYPE_NAMES_MAPPING
 
 class Command(BaseCommand):
     args = ''
     help = _('Manually import data for when the web UI fails. See http://panda.readthedocs.org/en/latest/manual_imports.html')
 
+    option_list = BaseCommand.option_list + (
+        make_option('-o', '--schema_overrides',
+            action='store',
+            dest='overrides',
+            help=_('Full path to CSV containing schema overrides. Field types: %s' % ', '.join(sorted(TYPE_NAMES_MAPPING.keys())))
+        ),
+    )
+
     def handle(self, *args, **options):
         if len(args) < 2:
             self.stderr.write(_('You must specify a filename and user.\n'))
@@ -20,6 +31,7 @@ def handle(self, *args, **options):
 
         filename = args[0]
         email = args[1]
+        overrides = self._schema_overrides(options)
 
         path = os.path.join(settings.MEDIA_ROOT, filename)
@@ -42,7 +54,7 @@ def handle(self, *args, **options):
             creator=creator,
             dataset=None,
             encoding='utf-8')
-
+
         dataset = Dataset.objects.create(
             name=filename,
             creator=creator,
@@ -50,8 +62,25 @@ def handle(self, *args, **options):
 
         self.stdout.write('%s http://%s/#dataset/%s\n' % (_('Dataset created:'), config_value('DOMAIN', 'SITE_DOMAIN'), dataset.slug))
 
-        dataset.import_data(creator, upload)
-
+        dataset.import_data(creator, upload, schema_overrides=overrides)
+        dataset.update_full_text()
         self.stdout.write(_('Import started. Check dataset page for progress.\n'))
+
+    def _schema_overrides(self, opts):
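+        # Expected override file layout, one "field,type" row per line, with
+        # the type column optional (e.g. "birthdate," or "zip,unicode").
+        # Returns a dict like:
+        #   {'birthdate': {'indexed': True}, 'zip': {'indexed': True, 'type': 'unicode'}}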
+        # The option's dest is always present in options; it is None when the
+        # -o flag was omitted (so a KeyError check would never fire).
+        fields_file = opts.get('overrides')
+        if not fields_file:
+            return {}
+        #TODO: error-handling if file doesn't exist or is malformed
+        valid_types = set(TYPE_NAMES_MAPPING.keys())
+        with open(fields_file) as csvfile:
+            data = {}
+            for field, dtype in csv.reader(csvfile):
+                # Activate indexing
+                data[field] = {'indexed': True}
+                # Update data type if provided and valid
+                if dtype in valid_types:
+                    data[field]['type'] = dtype
+        return data
diff --git a/panda/migrations/0034_auto__chg_field_relatedupload_size__chg_field_export_size__chg_field_d.py b/panda/migrations/0034_auto__chg_field_relatedupload_size__chg_field_export_size__chg_field_d.py
new file mode 100644
index 0000000..63da042
--- /dev/null
+++ b/panda/migrations/0034_auto__chg_field_relatedupload_size__chg_field_export_size__chg_field_d.py
@@ -0,0 +1,192 @@
+# -*- coding: utf-8 -*-
+import datetime
+from south.db import db
+from south.v2 import SchemaMigration
+from django.db import models
+
+
+class Migration(SchemaMigration):
+
+    def forwards(self, orm):
+
+        # Changing field 'RelatedUpload.size'
+        db.alter_column('panda_relatedupload', 'size', self.gf('django.db.models.fields.BigIntegerField')())
+
+        # Changing field 'Export.size'
+        db.alter_column('panda_export', 'size', self.gf('django.db.models.fields.BigIntegerField')())
+
+        # Changing field 'DataUpload.size'
+        db.alter_column('panda_dataupload', 'size', self.gf('django.db.models.fields.BigIntegerField')())
+
+    def backwards(self, orm):
+
+        # Changing field 'RelatedUpload.size'
+        db.alter_column('panda_relatedupload', 'size', self.gf('django.db.models.fields.IntegerField')())
+
+        # Changing field 'Export.size'
+        db.alter_column('panda_export', 'size', self.gf('django.db.models.fields.IntegerField')())
+
+        # Changing field 'DataUpload.size'
+        db.alter_column('panda_dataupload', 'size', self.gf('django.db.models.fields.IntegerField')())
+
+    models = {
+        'auth.group': {
+            'Meta': {'object_name': 'Group'},
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}),
+            'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'})
+        },
+        'auth.permission': {
+            'Meta': {'ordering': "('content_type__app_label', 'content_type__model', 'codename')", 'unique_together': "(('content_type', 'codename'),)", 'object_name': 'Permission'},
+            'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+            'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['contenttypes.ContentType']"}),
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'name': ('django.db.models.fields.CharField', [], {'max_length': '50'})
+        },
+        'auth.user': {
+            'Meta': {'object_name': 'User'},
+            'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
+            'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}),
+            'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
+            'groups': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Group']", 'symmetrical': 'False', 'blank': 'True'}),
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
+            'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+            'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+            'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
+            'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
+            'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}),
+            'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}),
+            'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '255'})
+        },
+        'contenttypes.contenttype': {
+            'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"},
+            'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
+            'name': ('django.db.models.fields.CharField', [], {'max_length': '100'})
+        },
+        'panda.activitylog': {
+            'Meta': {'unique_together': "(('user', 'when'),)", 'object_name': 'ActivityLog'},
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'user': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'activity_logs'", 'to': "orm['auth.User']"}),
+            'when': ('django.db.models.fields.DateField', [], {'auto_now': 'True', 'blank': 'True'})
+        },
+        'panda.category': {
+            'Meta': {'object_name': 'Category'},
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'name': ('django.db.models.fields.CharField', [], {'max_length': '64'}),
+            'slug': ('django.db.models.fields.SlugField', [], {'max_length': '256'})
+        },
+        'panda.dataset': {
+            'Meta': {'ordering': "['-creation_date']", 'object_name': 'Dataset'},
+            'categories': ('django.db.models.fields.related.ManyToManyField', [], {'blank': 'True', 'related_name': "'datasets'", 'null': 'True', 'symmetrical': 'False', 'to': "orm['panda.Category']"}),
+            'column_schema': ('panda.fields.JSONField', [], {'default': 'None', 'null': 'True'}),
+            'creation_date': ('django.db.models.fields.DateTimeField', [], {'null': 'True'}),
+            'creator': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'datasets'", 'to': "orm['auth.User']"}),
+            'current_task': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['panda.TaskStatus']", 'null': 'True', 'blank': 'True'}),
+            'description': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'initial_upload': ('django.db.models.fields.related.ForeignKey', [], {'blank': 'True', 'related_name': "'initial_upload_for'", 'null': 'True', 'to': "orm['panda.DataUpload']"}),
+            'last_modification': ('django.db.models.fields.TextField', [], {'default': 'None', 'null': 'True', 'blank': 'True'}),
+            'last_modified': ('django.db.models.fields.DateTimeField', [], {'default': 'None', 'null': 'True', 'blank': 'True'}),
+            'last_modified_by': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['auth.User']", 'null': 'True', 'blank': 'True'}),
+            'locked': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+            'locked_at': ('django.db.models.fields.DateTimeField', [], {'default': 'None', 'null': 'True'}),
+            'name': ('django.db.models.fields.CharField', [], {'max_length': '256'}),
+            'related_links': ('panda.fields.JSONField', [], {'default': '[]'}),
+            'row_count': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'blank': 'True'}),
+            'sample_data': ('panda.fields.JSONField', [], {'default': 'None', 'null': 'True'}),
+            'slug': ('django.db.models.fields.SlugField', [], {'max_length': '256'})
+        },
+        'panda.dataupload': {
+            'Meta': {'ordering': "['creation_date']", 'object_name': 'DataUpload'},
+            'columns': ('panda.fields.JSONField', [], {'null': 'True'}),
+            'creation_date': ('django.db.models.fields.DateTimeField', [], {}),
+            'creator': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['auth.User']"}),
+            'data_type': ('django.db.models.fields.CharField', [], {'max_length': '4', 'null': 'True', 'blank': 'True'}),
+            'dataset': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'data_uploads'", 'null': 'True', 'to': "orm['panda.Dataset']"}),
+            'deletable': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
+            'dialect': ('panda.fields.JSONField', [], {'null': 'True'}),
+            'encoding': ('django.db.models.fields.CharField', [], {'default': "'utf-8'", 'max_length': '32'}),
+            'filename': ('django.db.models.fields.CharField', [], {'max_length': '256'}),
+            'guessed_types': ('panda.fields.JSONField', [], {'null': 'True'}),
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'imported': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
+            'original_filename': ('django.db.models.fields.CharField', [], {'max_length': '256'}),
+            'sample_data': ('panda.fields.JSONField', [], {'null': 'True'}),
+            'size': ('django.db.models.fields.BigIntegerField', [], {}),
+            'title': ('django.db.models.fields.TextField', [], {'max_length': '256'})
+        },
+        'panda.export': {
+            'Meta': {'ordering': "['creation_date']", 'object_name': 'Export'},
+            'creation_date': ('django.db.models.fields.DateTimeField', [], {}),
+            'creator': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['auth.User']"}),
+            'dataset': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'exports'", 'null': 'True', 'to': "orm['panda.Dataset']"}),
+            'filename': ('django.db.models.fields.CharField', [], {'max_length': '256'}),
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'original_filename': ('django.db.models.fields.CharField', [], {'max_length': '256'}),
+            'size': ('django.db.models.fields.BigIntegerField', [], {}),
+            'title': ('django.db.models.fields.TextField', [], {'max_length': '256'})
+        },
+        'panda.notification': {
+            'Meta': {'ordering': "['-sent_at']", 'object_name': 'Notification'},
+            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
+            'message': ('django.db.models.fields.TextField', [], {}),
+            'read_at': ('django.db.models.fields.DateTimeField', [], {'default': 'None', 'null': 'True', 'blank': 'True'}),
+            'recipient': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'notifications'", 'to': "orm['auth.User']"}),
+            'sent_at': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
+            'type': ('django.db.models.fields.CharField', [], {'default': "'Info'", 'max_length': '16'}),
+            'url': ('django.db.models.fields.URLField', [], {'default': 'None', 'max_length': '200', 'null': 'True'})
+        },
+        'panda.relatedupload': {
+            'Meta': {'ordering': "['creation_date']", 'object_name': 'RelatedUpload'},
+            'creation_date': ('django.db.models.fields.DateTimeField', [], {}),
+            'creator': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['auth.User']"}),
('django.db.models.fields.related.ForeignKey', [], {'to': "orm['auth.User']"}), + 'dataset': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'related_uploads'", 'to': "orm['panda.Dataset']"}), + 'filename': ('django.db.models.fields.CharField', [], {'max_length': '256'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'original_filename': ('django.db.models.fields.CharField', [], {'max_length': '256'}), + 'size': ('django.db.models.fields.BigIntegerField', [], {}), + 'title': ('django.db.models.fields.TextField', [], {'max_length': '256'}) + }, + 'panda.searchlog': { + 'Meta': {'object_name': 'SearchLog'}, + 'dataset': ('django.db.models.fields.related.ForeignKey', [], {'default': 'None', 'related_name': "'searches'", 'null': 'True', 'to': "orm['panda.Dataset']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'query': ('django.db.models.fields.CharField', [], {'max_length': '4096'}), + 'user': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'search_logs'", 'to': "orm['auth.User']"}), + 'when': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}) + }, + 'panda.searchsubscription': { + 'Meta': {'object_name': 'SearchSubscription'}, + 'category': ('django.db.models.fields.related.ForeignKey', [], {'default': 'None', 'related_name': "'search_subscriptions'", 'null': 'True', 'to': "orm['panda.Category']"}), + 'dataset': ('django.db.models.fields.related.ForeignKey', [], {'default': 'None', 'related_name': "'search_subscriptions'", 'null': 'True', 'to': "orm['panda.Dataset']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'last_run': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}), + 'query': ('django.db.models.fields.CharField', [], {'max_length': '256'}), + 'query_human': ('django.db.models.fields.TextField', [], {}), + 'query_url': ('django.db.models.fields.CharField', [], {'max_length': '256'}), + 'user': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'search_subscriptions'", 'to': "orm['auth.User']"}) + }, + 'panda.taskstatus': { + 'Meta': {'object_name': 'TaskStatus'}, + 'creator': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'tasks'", 'null': 'True', 'to': "orm['auth.User']"}), + 'end': ('django.db.models.fields.DateTimeField', [], {'null': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'message': ('django.db.models.fields.CharField', [], {'max_length': '255', 'blank': 'True'}), + 'start': ('django.db.models.fields.DateTimeField', [], {'null': 'True'}), + 'status': ('django.db.models.fields.CharField', [], {'default': "'PENDING'", 'max_length': '50'}), + 'task_description': ('django.db.models.fields.TextField', [], {}), + 'task_name': ('django.db.models.fields.CharField', [], {'max_length': '255'}), + 'traceback': ('django.db.models.fields.TextField', [], {'default': 'None', 'null': 'True', 'blank': 'True'}) + }, + 'panda.userprofile': { + 'Meta': {'object_name': 'UserProfile'}, + 'activation_key': ('django.db.models.fields.CharField', [], {'max_length': '40', 'null': 'True', 'blank': 'True'}), + 'activation_key_expiration': ('django.db.models.fields.DateTimeField', [], {}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'show_login_help': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'user': ('django.db.models.fields.related.OneToOneField', [], 
{'to': "orm['auth.User']", 'unique': 'True'}) + } + } + + complete_apps = ['panda'] \ No newline at end of file diff --git a/panda/models/base_upload.py b/panda/models/base_upload.py index c5cd63a..0e67fde 100644 --- a/panda/models/base_upload.py +++ b/panda/models/base_upload.py @@ -18,7 +18,7 @@ class BaseUpload(models.Model): original_filename = models.CharField(_('original_filename'), max_length=256, help_text=_('Filename as originally uploaded.')) - size = models.IntegerField(_('size'), + size = models.BigIntegerField(_('size'), help_text=_('Size of the file in bytes.')) creator = models.ForeignKey(UserProxy, help_text=_('The user who uploaded this file.'), diff --git a/panda/models/dataset.py b/panda/models/dataset.py index c0b2d6c..e05014c 100644 --- a/panda/models/dataset.py +++ b/panda/models/dataset.py @@ -177,7 +177,7 @@ def delete(self, *args, **kwargs): super(Dataset, self).delete(*args, **kwargs) - def import_data(self, user, upload, external_id_field_index=None): + def import_data(self, user, upload, external_id_field_index=None, schema_overrides = {}): """ Import data into this ``Dataset`` from a given ``DataUpload``. """ @@ -198,8 +198,8 @@ def import_data(self, user, upload, external_id_field_index=None): if upload.columns != [c['name'] for c in self.column_schema]: raise DataImportError(_('The columns in this file do not match those in the dataset.')) else: - self.column_schema = make_column_schema(upload.columns, types=upload.guessed_types) - + self.column_schema = make_column_schema(upload.columns, types=upload.guessed_types, overrides=schema_overrides) + if self.sample_data is None: self.sample_data = upload.sample_data diff --git a/panda/tests/test_data_upload.py b/panda/tests/test_data_upload.py index 7cef156..33e67f4 100644 --- a/panda/tests/test_data_upload.py +++ b/panda/tests/test_data_upload.py @@ -2,6 +2,7 @@ import os.path +from django.db import DatabaseError from django.conf import settings from django.test import TransactionTestCase @@ -39,6 +40,17 @@ def test_created(self): self.assertEqual(upload.deletable, True) + def test_create_large_file(self): + # Max number capable of storage in Postgres integer field, plus 1 + # (errors out with IntegerField; passes with BigInt) + upload = utils.get_test_data_upload(self.user, self.dataset, size=2147483648) + # Test BigInt outer boundaries + # Max bigint number + upload2 = utils.get_test_data_upload(self.user, self.dataset, size=9223372036854775807) + self.assertEqual(upload2.size, 9223372036854775807) + # Max bigint + 1 + self.assertRaises(DatabaseError, utils.get_test_data_upload, self.user, self.dataset, size=9223372036854775808) + def test_delete(self): upload = utils.get_test_data_upload(self.user, self.dataset) upload_id = upload.id diff --git a/panda/tests/test_dataset.py b/panda/tests/test_dataset.py index 911cc94..0515629 100644 --- a/panda/tests/test_dataset.py +++ b/panda/tests/test_dataset.py @@ -164,6 +164,40 @@ def test_import_oo_xlsx(self): self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1) + def test_import_csv_with_schema_overrides(self): + + overrides = { + 'id': {'indexed': True, 'type': 'float'}, + 'last_name': {'indexed': True}, + } + self.dataset.import_data(self.user, self.upload, schema_overrides=overrides) + + task = self.dataset.current_task + + self.assertNotEqual(task, None) + self.assertNotEqual(task.id, None) + self.assertEqual(task.task_name, 'panda.tasks.import.csv') + + # Refresh from database + dataset = 
+        # Max number capable of storage in Postgres integer field, plus 1
+        # (errors out with IntegerField; passes with BigInt)
+        upload = utils.get_test_data_upload(self.user, self.dataset, size=2147483648)
+
+        # Test BigInt outer boundaries
+        # Max bigint number
+        upload2 = utils.get_test_data_upload(self.user, self.dataset, size=9223372036854775807)
+        self.assertEqual(upload2.size, 9223372036854775807)
+
+        # Max bigint + 1
+        self.assertRaises(DatabaseError, utils.get_test_data_upload, self.user, self.dataset, size=9223372036854775808)
+
     def test_delete(self):
         upload = utils.get_test_data_upload(self.user, self.dataset)
         upload_id = upload.id
diff --git a/panda/tests/test_dataset.py b/panda/tests/test_dataset.py
index 911cc94..0515629 100644
--- a/panda/tests/test_dataset.py
+++ b/panda/tests/test_dataset.py
@@ -164,6 +164,40 @@ def test_import_oo_xlsx(self):
 
         self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
 
+    def test_import_csv_with_schema_overrides(self):
+        overrides = {
+            'id': {'indexed': True, 'type': 'float'},
+            'last_name': {'indexed': True},
+        }
+        self.dataset.import_data(self.user, self.upload, schema_overrides=overrides)
+
+        task = self.dataset.current_task
+
+        self.assertNotEqual(task, None)
+        self.assertNotEqual(task.id, None)
+        self.assertEqual(task.task_name, 'panda.tasks.import.csv')
+
+        # Refresh from database
+        dataset = Dataset.objects.get(id=self.dataset.id)
+        upload = DataUpload.objects.get(id=self.upload.id)
+        task = TaskStatus.objects.get(id=task.id)
+
+        self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
+        self.assertEqual(upload.guessed_types, ['int', 'unicode', 'unicode', 'unicode'])
+        # NOTE: Without overrides, the type of "id" would be "int" (per guessed_types) and all indexed_names would be None
+        self.assertEqual([c['type'] for c in dataset.column_schema], ['float', 'unicode', 'unicode', 'unicode'])
+        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_float_id', None, 'column_unicode_last_name', None])
+        self.assertEqual(dataset.row_count, 4)
+        self.assertEqual(upload.imported, True)
+        self.assertEqual(task.status, 'SUCCESS')
+        self.assertNotEqual(task.start, None)
+        self.assertNotEqual(task.end, None)
+        self.assertEqual(task.traceback, None)
+        self.assertEqual(dataset.locked, False)
+
+        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
+
     def test_import_additional_data_same_columns(self):
         self.dataset.import_data(self.user, self.upload)
diff --git a/panda/tests/utils.py b/panda/tests/utils.py
index 3678886..85d1b1a 100644
--- a/panda/tests/utils.py
+++ b/panda/tests/utils.py
@@ -52,7 +52,7 @@ def get_test_dataset(creator):
 
     return dataset
 
-def get_test_data_upload(creator, dataset, filename=TEST_DATA_FILENAME, encoding='utf8'):
+def get_test_data_upload(creator, dataset, filename=TEST_DATA_FILENAME, encoding='utf8', size=None):
     # Ensure panda subdir has been created
     try:
        os.mkdir(settings.MEDIA_ROOT)
@@ -66,7 +66,7 @@ def get_test_data_upload(creator, dataset, filename=TEST_DATA_FILENAME, encoding
     return DataUpload.objects.create(
         filename=filename,
         original_filename=filename,
-        size=os.path.getsize(dst),
+        size=size or os.path.getsize(dst),
         creator=creator,
         dataset=dataset,
         encoding=encoding)
diff --git a/panda/utils/column_schema.py b/panda/utils/column_schema.py
index 259a5a0..3ec5a4b 100644
--- a/panda/utils/column_schema.py
+++ b/panda/utils/column_schema.py
@@ -44,7 +44,7 @@ def update_indexed_names(column_schema):
 
     return column_schema
 
-def make_column_schema(columns, indexed=None, types=None):
+def make_column_schema(columns, indexed=None, types=None, overrides={}):
     """
     Generate a column schema from parallel arrays of columns, index booleans, and index types.
     """
@@ -59,6 +59,11 @@ def make_column_schema(columns, indexed=None, types=None):
             'min': None,
             'max': None
         }
+        # Apply overrides for schema and index options
+        try:
+            c.update(overrides[name])
+        except KeyError:
+            pass
 
         column_schema.append(c)
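
Review note: below is a minimal sketch of how the ``overrides`` merge in ``make_column_schema`` behaves, based on the assertions in ``test_import_csv_with_schema_overrides`` above. It is a standalone illustration, not part of the patch:

.. code-block:: python

    from panda.utils.column_schema import make_column_schema

    overrides = {'id': {'indexed': True, 'type': 'float'}}
    schema = make_column_schema(['id', 'first_name'], types=['int', 'unicode'], overrides=overrides)

    # The override dict is merged into the generated entry for 'id', so
    # schema[0]['type'] == 'float' and schema[0]['indexed'] is True, while
    # 'first_name' keeps its guessed type ('unicode') untouched.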