인문지식 처리와 프로그래밍2020 4.16

soook
이동: 둘러보기, 검색

CRAWLing

공공의 이익을 위해서가 아니면 안 된다!!!

문화재


#!/usr/bin/python

import sys
import os
import urllib.request
import shutil

address = "http://www.heritage.go.kr/heri/cul/culSelectDetail.do?ccbaCpno="  # Cultural Heritage Administration national heritage portal: detail-page URL prefix; a ccbaCpno value is appended to it
	
def downImage(document, sequence, urlstring):
	"""Download the first JPEG URL found in ``urlstring`` next to ``document``.

	The image is saved as ``<document without extension>-<sequence:03>.jpg``.
	When no absolute http/https ``.jpg`` URL can be found in the text, or the
	download fails, the placeholder ``noimage.jpg`` from the current working
	directory is copied to that path instead.

	Args:
		document: Path of the saved HTML page; its extension is dropped to
			build the image file name.
		sequence: Integer image index, zero-padded to three digits.
		urlstring: A line of HTML expected to contain the image URL.
	"""
	import re  # local import: the script's top-level imports do not include re

	path = "{0}-{1:03}.jpg".format(os.path.splitext(document)[0], sequence)

	# Match on the original text (case-insensitively) rather than slicing it
	# with indices computed on an upper-cased copy. Both http and https are
	# accepted, and the match may not cross quotes, whitespace or angle
	# brackets, so it stays inside one attribute value.
	match = re.search(r"""https?://[^\s"'<>]*?\.jpg""", urlstring, re.IGNORECASE)
	if match is None:
		shutil.copy2('noimage.jpg', path)
		return

	try:
		urllib.request.urlretrieve(match.group(0), path)
	except Exception:
		# Best effort: any download failure falls back to the placeholder.
		# KeyboardInterrupt/SystemExit are deliberately not caught, so the
		# crawl can still be stopped by the user.
		shutil.copy2('noimage.jpg', path)


def findImage(path):
	"""Fetch the first JPEG referenced in the saved HTML page at ``path``.

	The page is read as UTF-8 line by line. The first line containing
	".jpg" (any letter case) is handed to ``downImage`` with sequence 0 and
	the search stops. If no line qualifies, a message is printed and
	``noimage.jpg`` is copied to ``<path without extension>-000.jpg``.

	Args:
		path: Path of the downloaded HTML file.
	"""
	# The context manager closes the page on every exit path, including the
	# early return taken as soon as an image line is found.
	with open(path, 'rt', encoding='UTF8') as page:
		for line in page:
			if ".JPG" in line.upper():
				downImage(path, 0, line.strip())
				return

	print("{0}: No Image".format(path))
	shutil.copy2('noimage.jpg', "{0}-000.jpg".format(os.path.splitext(path)[0]))
	
def downHtml(folder, file):
	"""Download one heritage detail page and then fetch its image.

	The page URL is ``address`` followed by ``file``; the page is saved as
	``<folder>/<file>.htm`` and passed to ``findImage``. If the download
	fails, a message is printed and the entry is skipped.

	Args:
		folder: Existing directory that receives the downloaded page.
		file: Identifier appended to ``address`` and used as the file name.
	"""
	url = address + file
	path = folder + '/' + file + '.htm'

	try:
		urllib.request.urlretrieve(url, path)
	except Exception:
		# Any download failure is reported with the original message and the
		# batch continues. KeyboardInterrupt/SystemExit are not caught, so
		# Ctrl-C stops the crawl instead of being reported as an invalid ID.
		print("{0}: Invalid Cpno!".format(file))
		return

	findImage(path)

def main():
	"""Crawl every identifier listed in ``<argv[1]>.lst``.

	The first command-line argument names both the list file (with ".lst"
	appended) and the output folder, which is created if missing. Each
	non-blank line of the list is stripped and passed to ``downHtml``.
	Returns silently when no argument is given.
	"""
	if len(sys.argv) < 2:
		return

	folder = sys.argv[1]
	list_path = folder + ".lst"

	# An already existing folder is fine; other creation errors now surface
	# here instead of failing later for every single entry.
	os.makedirs(folder, exist_ok=True)

	# The context manager closes the list even if a download step raises.
	with open(list_path) as listing:
		for line in listing:
			file = line.strip()
			if not file:
				# Skip blank lines rather than requesting the bare base URL.
				continue
			print("{0}: Processing....".format(file))
			downHtml(folder, file)

# Top-level call: main() runs whenever this file is executed or imported.
main()


민족문화대백과사전

#!/usr/bin/python

import sys
import os
import urllib.request
import shutil

address1 = "http://encykorea.aks.ac.kr/Contents/Item/"	# Encyclopedia of Korean Culture: article URL prefix
address2 = "http://encykorea.aks.ac.kr/Contents/GetImage?id=" # Encyclopedia of Korean Culture: image URL prefix

def downImage(document, sequence, urlstring):
	"""Download one encyclopedia image next to ``document``.

	The image URL is ``address2`` followed by ``urlstring``; the result is
	saved as ``<document without extension>-<sequence:03>.jpg``. If the
	download fails, the placeholder ``noimage.jpg`` from the current working
	directory is copied to that path instead.

	Args:
		document: Path of the saved HTML page; its extension is dropped to
			build the image file name.
		sequence: Integer image index, zero-padded to three digits.
		urlstring: Image identifier appended to ``address2``.
	"""
	url = address2 + urlstring
	path = "{0}-{1:03}.jpg".format(os.path.splitext(document)[0], sequence)

	try:
		urllib.request.urlretrieve(url, path)
	except Exception:
		# Best effort: any download failure falls back to the placeholder.
		# KeyboardInterrupt/SystemExit are deliberately not caught, so the
		# crawl can still be stopped by the user.
		shutil.copy2('noimage.jpg', path)


def _extract_image_id(line):
	"""Return the image ID following "/GetImage?id=" in ``line``, or None."""
	marker = "/GetImage?id="
	start = line.find(marker)
	if start == -1:
		return None
	start += len(marker)

	# Look for the width parameter only after the marker, so an earlier
	# "&w=" on the same line cannot produce an empty or reversed slice.
	end = line.find("&w=", start)
	if end != -1:
		return line[start:end]

	# No width parameter: cut at the next delimiter instead of silently
	# dropping the last character of the line.
	end = len(line)
	for delimiter in ('&', '"', "'"):
		position = line.find(delimiter, start)
		if position != -1:
			end = min(end, position)
	return line[start:end].strip()


def findImage(path):
	"""Fetch the first image referenced in the saved article page at ``path``.

	The page is read as UTF-8 line by line. The first line containing
	"/GetImage?id=" supplies the image ID, which is handed to ``downImage``
	with sequence 0, and the search stops. If no line qualifies, a message
	is printed and ``noimage.jpg`` is copied to
	``<path without extension>-000.jpg``.

	Args:
		path: Path of the downloaded HTML file.
	"""
	# Sample markup seen in the pages:
	#src="/Contents/GetImage?id=ceb01938-4ed4-4bd8-ac91-e884b7fc0460&w=260&h=260&fit=w&clip=1"
	#multi_item = {mmid:'2a71cba8-eba7-4e1d-b4b0-30818c766013'
	# Only the first form (GetImage?id=...) is parsed here.

	# The context manager closes the page on every exit path, including the
	# early return taken as soon as an image line is found.
	with open(path, 'rt', encoding='UTF8') as page:
		for line in page:
			image_id = _extract_image_id(line)
			if image_id is not None:
				downImage(path, 0, image_id)
				return

	print("{0}: No Image".format(path))
	shutil.copy2('noimage.jpg', "{0}-000.jpg".format(os.path.splitext(path)[0]))
	
def downHtml(folder, file):
	"""Download one encyclopedia article page and then fetch its image.

	The page URL is ``address1`` followed by ``file``; the page is saved as
	``<folder>/<file>.htm`` and passed to ``findImage``. If the download
	fails, a message is printed and the entry is skipped.

	Args:
		folder: Existing directory that receives the downloaded page.
		file: Article identifier appended to ``address1`` and used as the
			file name.
	"""
	url = address1 + file
	path = folder + '/' + file + '.htm'

	try:
		urllib.request.urlretrieve(url, path)
	except Exception:
		# Any download failure is reported with the original message and the
		# batch continues. KeyboardInterrupt/SystemExit are not caught, so
		# Ctrl-C stops the crawl instead of being reported as an invalid ID.
		print("{0}: Invalid ID!".format(file))
		return

	findImage(path)

def main():
	"""Crawl every article identifier listed in ``<argv[1]>.lst``.

	The first command-line argument names both the list file (with ".lst"
	appended) and the output folder, which is created if missing. Each
	non-blank line of the list is stripped and passed to ``downHtml``.
	Returns silently when no argument is given.
	"""
	if len(sys.argv) < 2:
		return

	folder = sys.argv[1]
	list_path = folder + ".lst"

	# An already existing folder is fine; other creation errors now surface
	# here instead of failing later for every single entry.
	os.makedirs(folder, exist_ok=True)

	# The context manager closes the list even if a download step raises.
	with open(list_path) as listing:
		for line in listing:
			file = line.strip()
			if not file:
				# Skip blank lines rather than requesting the bare base URL.
				continue
			print("{0}: Processing....".format(file))
			downHtml(folder, file)

# Top-level call: main() runs whenever this file is executed or imported.
main()